[MAJOR] modularize the polling mechanisms
select, poll and epoll now have their dedicated functions and have
been split into distinct files. Several FD manipulation primitives
have been provided with each poller.
The rest of the code still needs to be cleaned up to remove traces of
StaticReadEvent/StaticWriteEvent; for now, a macro-based trick is used
as a temporary workaround. Some work also remains to factor out the
tests and sets everywhere.
diff --git a/Makefile b/Makefile
index faa37c0..d3d5c97 100644
--- a/Makefile
+++ b/Makefile
@@ -11,6 +11,16 @@
#TARGET = linux22
#TARGET = solaris
+USE_POLL = 1
+
+ifeq ($(TARGET),linux24e)
+USE_EPOLL = 1
+endif
+
+ifeq ($(TARGET),linux26)
+USE_EPOLL = 1
+endif
+
# pass CPU=<cpu_name> to make to optimize for a particular CPU
CPU = generic
#CPU = i586
@@ -35,26 +45,26 @@
TCPSPLICEDIR :=
# This is for standard Linux 2.6 with netfilter and epoll()
-COPTS.linux26 = -DNETFILTER -DENABLE_POLL -DENABLE_EPOLL
+COPTS.linux26 = -DNETFILTER
LIBS.linux26 =
# This is for enhanced Linux 2.4 with netfilter and epoll() patch.
# Warning! If kernel is 2.4 with epoll-lt <= 0.21, then you must add
# -DEPOLL_CTL_MOD_WORKAROUND to workaround a very rare bug.
-#COPTS.linux24e = -DNETFILTER -DENABLE_POLL -DENABLE_EPOLL -DUSE_MY_EPOLL -DEPOLL_CTL_MOD_WORKAROUND
-COPTS.linux24e = -DNETFILTER -DENABLE_POLL -DENABLE_EPOLL -DUSE_MY_EPOLL
+#COPTS.linux24e = -DNETFILTER -DUSE_MY_EPOLL -DEPOLL_CTL_MOD_WORKAROUND
+COPTS.linux24e = -DNETFILTER -DUSE_MY_EPOLL
LIBS.linux24e =
# This is for standard Linux 2.4 with netfilter but without epoll()
-COPTS.linux24 = -DNETFILTER -DENABLE_POLL
+COPTS.linux24 = -DNETFILTER
LIBS.linux24 =
# This is for Linux 2.2
-COPTS.linux22 = -DUSE_GETSOCKNAME -DENABLE_POLL
+COPTS.linux22 = -DUSE_GETSOCKNAME
LIBS.linux22 =
# This is for Solaris 8
-COPTS.solaris = -fomit-frame-pointer -DENABLE_POLL -DFD_SETSIZE=65536
+COPTS.solaris = -fomit-frame-pointer -DFD_SETSIZE=65536
LIBS.solaris = -lnsl -lsocket
# CPU dependant optimizations
@@ -92,7 +102,6 @@
ADDLIB =
# set some defines when needed.
-# Known ones are -DENABLE_POLL, -DENABLE_EPOLL, and -DUSE_MY_EPOLL
# - use -DTPROXY to compile with transparent proxy support.
DEFINE = -DTPROXY
@@ -136,10 +145,12 @@
ifneq ($(USE_POLL),)
OPTIONS += -DENABLE_POLL
+OPT_OBJS += src/ev_poll.o
endif
ifneq ($(USE_EPOLL),)
OPTIONS += -DENABLE_EPOLL
+OPT_OBJS += src/ev_epoll.o
endif
ifneq ($(USE_MY_EPOLL),)
@@ -199,7 +210,7 @@
src/time.o src/fd.o src/regex.o src/cfgparse.o src/server.o \
src/checks.o src/queue.o src/capture.o src/client.o src/proxy.o \
src/proto_http.o src/stream_sock.o src/appsession.o src/backend.o \
- src/session.o src/hdr_idx.o src/rbtree.o
+ src/session.o src/hdr_idx.o src/rbtree.o src/ev_select.o
haproxy: $(OBJS) $(OPT_OBJS)
$(LD) $(LDFLAGS) -o $@ $^ $(LIBS)
diff --git a/Makefile.bsd b/Makefile.bsd
index 67f51d9..043ff04 100644
--- a/Makefile.bsd
+++ b/Makefile.bsd
@@ -87,7 +87,7 @@
src/time.o src/fd.o src/regex.o src/cfgparse.o src/server.o \
src/checks.o src/queue.o src/capture.o src/client.o src/proxy.o \
src/proto_http.o src/stream_sock.o src/appsession.o src/backend.o \
- src/session.o src/hdr_idx.o src/rbtree.o
+ src/session.o src/hdr_idx.o src/rbtree.o src/ev_select.o src/ev_poll.o
all: haproxy
diff --git a/Makefile.osx b/Makefile.osx
index a4fda97..cb79d57 100644
--- a/Makefile.osx
+++ b/Makefile.osx
@@ -87,7 +87,7 @@
src/time.o src/fd.o src/regex.o src/cfgparse.o src/server.o \
src/checks.o src/queue.o src/capture.o src/client.o src/proxy.o \
src/proto_http.o src/stream_sock.o src/appsession.o src/backend.o \
- src/session.o src/hdr_idx.o src/rbtree.o
+ src/session.o src/hdr_idx.o src/rbtree.o src/ev_select.o src/ev_poll.o
all: haproxy
diff --git a/include/proto/fd.h b/include/proto/fd.h
index e41fcd1..03a7add 100644
--- a/include/proto/fd.h
+++ b/include/proto/fd.h
@@ -2,7 +2,7 @@
include/proto/fd.h
File descriptors states.
- Copyright (C) 2000-2006 Willy Tarreau - w@1wt.eu
+ Copyright (C) 2000-2007 Willy Tarreau - w@1wt.eu
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
@@ -34,29 +34,41 @@
*/
void fd_delete(int fd);
+/* registers all known pollers; takes no argument */
+void register_pollers(void);
+
+/* disable the specified poller, designated by its name */
+void disable_poller(const char *poller_name);
/*
- * Benchmarks performed on a Pentium-M notebook show that using functions
- * instead of the usual macros improve the FD_* performance by about 80%,
- * and that marking them regparm(2) adds another 20%.
+ * Initialize the pollers till the best one is found.
+ * If none works, returns 0, otherwise 1.
*/
-#if defined(CONFIG_HAP_INLINE_FD_SET)
+int init_pollers(void); /* (void) makes this a real prototype: stray arguments are rejected */
-# define MY_FD_SET FD_SET
-# define MY_FD_CLR FD_CLR
-# define MY_FD_ISSET FD_ISSET
+/*
+ * Runs the polling loop
+ */
+void run_poller(void); /* (void) makes this a real prototype: stray arguments are rejected */
-#else
-# define MY_FD_SET my_fd_set
-# define MY_FD_CLR my_fd_clr
-# define MY_FD_ISSET my_fd_isset
+/* FIXME: dirty hack during code transition */
+#define dir_StaticWriteEvent DIR_WR
+#define dir_StaticReadEvent DIR_RD
+#define dir_DIR_RD DIR_RD
+#define dir_DIR_WR DIR_WR
-REGPRM2 void my_fd_set(const int fd, fd_set *ev);
-REGPRM2 void my_fd_clr(const int fd, fd_set *ev);
-REGPRM2 int my_fd_isset(const int fd, const fd_set *ev);
+#define MY_FD_SET(fd, ev) (cur_poller.set((fd), dir_##ev))
+#define MY_FD_CLR(fd, ev) (cur_poller.clr((fd), dir_##ev))
+#define MY_FD_ISSET(fd, ev) (cur_poller.isset((fd), dir_##ev))
-#endif
+#define EV_FD_SET(fd, ev) (cur_poller.set((fd), dir_##ev)) /* poll <fd> for <ev>; <ev> is token-pasted onto dir_ */
+#define EV_FD_CLR(fd, ev) (cur_poller.clr((fd), dir_##ev)) /* stop polling <fd> for <ev> */
+#define EV_FD_ISSET(fd, ev) (cur_poller.isset((fd), dir_##ev)) /* is <fd> being polled for <ev>? */
+#define EV_FD_COND_S(fd, ev) (cur_poller.cond_s((fd), dir_##ev)) /* set polling for <ev> only if unset */
+#define EV_FD_COND_C(fd, ev) (cur_poller.cond_c((fd), dir_##ev)) /* clear polling for <ev> only if set */
+#define EV_FD_REM(fd) (cur_poller.rem(fd)) /* remove any polling on <fd> */
+#define EV_FD_CLO(fd) (cur_poller.clo(fd)) /* mark <fd> as closed */
/* recomputes the maxfd limit from the fd */
diff --git a/include/types/fd.h b/include/types/fd.h
index ae8872b..6d8b31f 100644
--- a/include/types/fd.h
+++ b/include/types/fd.h
@@ -53,6 +53,43 @@
int state; /* the state of this fd */
};
+/*
+ * Poller descriptors.
+ * - <name> is initialized by the poller's register() function, and should not
+ * be allocated, just linked to.
+ * - <pref> is initialized by the poller's register() function. It is set to 0
+ * by default, meaning the poller is disabled. init() should set it to 0 in
+ * case of failure. term() must set it to 0. A generic unoptimized select()
+ * poller should set it to 100 (NOTE(review): ev_select.c registers 150).
+ * - <private> is initialized by the poller's init() function, and cleaned by
+ * the term() function.
+ * - cond_s() sets fd and returns 1 if it was not set; otherwise returns 0.
+ * - cond_c() clears fd and returns 1 if it was set; otherwise returns 0.
+ * - clo() should be used to tell the poller that fd will be closed. It
+ * may be the same as rem() on some pollers.
+ * - poll() calls the poller, waiting at most wait_time ms.
+ */
+struct poller {
+ void *private; /* any private data for the poller */
+ REGPRM2 int (*isset)(const int fd, const int dir); /* check if <fd> is being polled for dir <dir> */
+ REGPRM2 void (*set)(const int fd, const int dir); /* set polling on <fd> for <dir> */
+ REGPRM2 void (*clr)(const int fd, const int dir); /* clear polling on <fd> for <dir> */
+ REGPRM2 int (*cond_s)(const int fd, const int dir); /* set polling on <fd> for <dir> if unset */
+ REGPRM2 int (*cond_c)(const int fd, const int dir); /* clear polling on <fd> for <dir> if set */
+ REGPRM1 void (*rem)(const int fd); /* remove any polling on <fd> */
+ REGPRM1 void (*clo)(const int fd); /* mark <fd> as closed */
+ REGPRM2 void (*poll)(struct poller *p, int wait_time); /* the poller itself */
+ REGPRM1 int (*init)(struct poller *p); /* poller initialization */
+ REGPRM1 void (*term)(struct poller *p); /* termination of this poller */
+ const char *name; /* poller name, linked to and not allocated */
+ int pref; /* try pollers with higher preference first; 0 means disabled */
+};
+
+extern struct poller cur_poller; /* the current poller */
+extern int nbpollers;
+#define MAX_POLLERS 10
+extern struct poller pollers[MAX_POLLERS]; /* all registered pollers */
+
extern struct fdtab *fdtab; /* array of all the file descriptors */
extern int maxfd; /* # of the highest fd + 1 */
extern int totalconn; /* total # of terminated sessions */
diff --git a/include/types/polling.h b/include/types/polling.h
index 821698e..ed3cf64 100644
--- a/include/types/polling.h
+++ b/include/types/polling.h
@@ -2,7 +2,7 @@
include/types/polling.h
File descriptors and polling definitions.
- Copyright (C) 2000-2006 Willy Tarreau - w@1wt.eu
+ Copyright (C) 2000-2007 Willy Tarreau - w@1wt.eu
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
@@ -53,8 +53,6 @@
#define POLL_USE_POLL (1<<1)
#define POLL_USE_EPOLL (1<<2)
-/* fd states */
-extern fd_set *StaticReadEvent, *StaticWriteEvent;
extern int cfg_polling_mechanism; /* POLL_USE_{SELECT|POLL|EPOLL} */
diff --git a/src/ev_epoll.c b/src/ev_epoll.c
new file mode 100644
index 0000000..ff49505
--- /dev/null
+++ b/src/ev_epoll.c
@@ -0,0 +1,355 @@
+/*
+ * FD polling functions for linux epoll()
+ *
+ * Copyright 2000-2007 Willy Tarreau <w@1wt.eu>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <unistd.h>
+#include <sys/time.h>
+#include <sys/types.h>
+
+#include <common/compat.h>
+#include <common/config.h>
+#include <common/time.h>
+
+#include <types/fd.h>
+#include <types/global.h>
+
+#include <proto/fd.h>
+#include <proto/polling.h>
+#include <proto/task.h>
+
+#if defined(USE_MY_EPOLL)
+#include <errno.h>
+#include <sys/syscall.h>
+_syscall1 (int, epoll_create, int, size);
+_syscall4 (int, epoll_ctl, int, epfd, int, op, int, fd, struct epoll_event *, event);
+_syscall4 (int, epoll_wait, int, epfd, struct epoll_event *, events, int, maxevents, int, timeout);
+#endif
+
+
+static fd_set *StaticReadEvent, *StaticWriteEvent;
+static fd_set *PrevReadEvent, *PrevWriteEvent;
+
+/* private data */
+static struct epoll_event *epoll_events;
+static int epoll_fd;
+
+
+/*
+ * Benchmarks performed on a Pentium-M notebook show that using functions
+ * instead of the usual macros improve the FD_* performance by about 80%,
+ * and that marking them regparm(2) adds another 20%.
+ */
+REGPRM2 static int __fd_isset(const int fd, const int dir)
+{
+	/* Tell whether <fd> is currently polled for direction <dir>: DIR_RD
+	 * is tracked in StaticReadEvent, anything else in StaticWriteEvent.
+	 */
+	if (dir != DIR_RD)
+		return FD_ISSET(fd, StaticWriteEvent);
+
+	return FD_ISSET(fd, StaticReadEvent);
+}
+
+REGPRM2 static void __fd_set(const int fd, const int dir)
+{
+	/* Start polling <fd> for direction <dir>: DIR_RD marks it in
+	 * StaticReadEvent, any other value marks it in StaticWriteEvent.
+	 */
+	fd_set *target;
+
+	target = (dir == DIR_RD) ? StaticReadEvent : StaticWriteEvent;
+	FD_SET(fd, target);
+}
+
+REGPRM2 static void __fd_clr(const int fd, const int dir)
+{
+	/* Stop polling <fd> for direction <dir>: DIR_RD clears it from
+	 * StaticReadEvent, any other value clears it from StaticWriteEvent.
+	 */
+	fd_set *target;
+
+	target = (dir == DIR_RD) ? StaticReadEvent : StaticWriteEvent;
+	FD_CLR(fd, target);
+}
+
+REGPRM2 static int __fd_cond_s(const int fd, const int dir)
+{
+	/* Conditional set: start polling <fd> for <dir> only if it is not
+	 * polled yet. Returns 1 when the bit was just set, 0 when it was
+	 * already set.
+	 */
+	fd_set *target = (dir == DIR_RD) ? StaticReadEvent : StaticWriteEvent;
+
+	if (FD_ISSET(fd, target))
+		return 0;
+
+	FD_SET(fd, target);
+	return 1;
+}
+
+REGPRM2 static int __fd_cond_c(const int fd, const int dir)
+{
+	/* Conditional clear: stop polling <fd> for <dir> if it is polled.
+	 * Returns the non-zero FD_ISSET() result if cleared, otherwise 0.
+	 */
+	fd_set *target = (dir == DIR_RD) ? StaticReadEvent : StaticWriteEvent;
+	const int was_set = FD_ISSET(fd, target);
+
+	if (!was_set)
+		return 0;
+
+	FD_CLR(fd, target);
+	return was_set;
+}
+
+REGPRM1 static void __fd_rem(const int fd)
+{
+	FD_CLR(fd, StaticWriteEvent);	/* stop polling <fd> for writes */
+	FD_CLR(fd, StaticReadEvent);	/* ... and for reads */
+}
+
+REGPRM1 static void __fd_clo(const int fd)
+{
+	FD_CLR(fd, PrevWriteEvent);	/* drop <fd> from the Prev* sets that */
+	FD_CLR(fd, PrevReadEvent);	/* epoll_poll() compares against ... */
+	FD_CLR(fd, StaticWriteEvent);	/* ... and from the wanted-state sets, */
+	FD_CLR(fd, StaticReadEvent);	/* exactly as __fd_rem() does */
+}
+
+
+
+/*
+ * Initialization of the epoll() poller.
+ * Returns 0 in case of failure, non-zero in case of success. If it fails, it
+ * disables the poller by setting its pref to 0.
+ */
+REGPRM1 static int epoll_init(struct poller *p)
+{
+ __label__ fail_pwevt, fail_prevt, fail_swevt, fail_srevt, fail_ee, fail_fd; /* GCC local-label declarations */
+ int fd_set_bytes; /* byte size of each descriptor bitmap allocated below */
+
+ p->private = NULL;
+ fd_set_bytes = sizeof(fd_set) * (global.maxsock + FD_SETSIZE - 1) / FD_SETSIZE; /* scaled from global.maxsock */
+
+ epoll_fd = epoll_create(global.maxsock + 1);
+ if (epoll_fd < 0)
+ goto fail_fd;
+
+ epoll_events = (struct epoll_event*)
+ calloc(1, sizeof(struct epoll_event) * global.maxsock); /* one event slot per possible socket */
+
+ if (epoll_events == NULL)
+ goto fail_ee;
+
+ if ((PrevReadEvent = (fd_set *)calloc(1, fd_set_bytes)) == NULL)
+ goto fail_prevt;
+
+ if ((PrevWriteEvent = (fd_set *)calloc(1, fd_set_bytes)) == NULL)
+ goto fail_pwevt;
+
+ if ((StaticReadEvent = (fd_set *)calloc(1, fd_set_bytes)) == NULL)
+ goto fail_srevt;
+
+ if ((StaticWriteEvent = (fd_set *)calloc(1, fd_set_bytes)) == NULL)
+ goto fail_swevt;
+
+ return 1; /* success: everything is allocated */
+
+ fail_swevt: /* unwind: release in reverse order of allocation */
+ free(StaticReadEvent); /* NOTE(review): freed pointers are not reset to NULL here */
+ fail_srevt:
+ free(PrevWriteEvent);
+ fail_pwevt:
+ free(PrevReadEvent);
+ fail_prevt:
+ free(epoll_events);
+ fail_ee:
+ close(epoll_fd);
+ epoll_fd = 0; /* NOTE(review): 0 is also a valid descriptor number */
+ fail_fd:
+ p->pref = 0; /* disable this poller */
+ return 0;
+}
+
+/*
+ * Termination of the epoll() poller: memory is released (and the pointers
+ * reset to NULL so none dangles) and the poller is marked as unselectable.
+ */
+REGPRM1 static void epoll_term(struct poller *p)
+{
+	free(StaticWriteEvent);		/* free(NULL) is a no-op: no guard needed */
+	StaticWriteEvent = NULL;
+
+	free(StaticReadEvent);
+	StaticReadEvent = NULL;
+
+	free(PrevWriteEvent);
+	PrevWriteEvent = NULL;
+
+	free(PrevReadEvent);
+	PrevReadEvent = NULL;
+
+	free(epoll_events);
+	epoll_events = NULL;
+
+	close(epoll_fd);
+	epoll_fd = 0;			/* NOTE(review): 0 is also a valid descriptor number */
+
+	p->private = NULL;
+	p->pref = 0;			/* term() must leave the poller disabled */
+}
+
+/*
+ * epoll() poller
+ */
+REGPRM2 static void epoll_poll(struct poller *p, int wait_time)
+{
+ int status;
+ int fd;
+
+ int fds, count;
+ int pr, pw, sr, sw; /* previous read/write and wanted (static) read/write bits of one fd */
+ unsigned rn, ro, wn, wo; /* read new, read old, write new, write old */
+ struct epoll_event ev;
+
+ for (fds = 0; (fds << INTBITS) < maxfd; fds++) { /* walk the bitmaps one int-sized word at a time */
+
+ rn = ((int*)StaticReadEvent)[fds]; ro = ((int*)PrevReadEvent)[fds];
+ wn = ((int*)StaticWriteEvent)[fds]; wo = ((int*)PrevWriteEvent)[fds];
+
+ if ((ro^rn) | (wo^wn)) { /* at least one fd of this word differs from the Prev* state */
+ for (count = 0, fd = fds << INTBITS; count < (1<<INTBITS) && fd < maxfd; count++, fd++) {
+#define FDSETS_ARE_INT_ALIGNED
+#ifdef FDSETS_ARE_INT_ALIGNED
+
+#define WE_REALLY_NOW_THAT_FDSETS_ARE_INTS
+#ifdef WE_REALLY_NOW_THAT_FDSETS_ARE_INTS
+ pr = (ro >> count) & 1;
+ pw = (wo >> count) & 1;
+ sr = (rn >> count) & 1;
+ sw = (wn >> count) & 1;
+#else
+ pr = FD_ISSET(fd&((1<<INTBITS)-1), (typeof(fd_set*))&ro);
+ pw = FD_ISSET(fd&((1<<INTBITS)-1), (typeof(fd_set*))&wo);
+ sr = FD_ISSET(fd&((1<<INTBITS)-1), (typeof(fd_set*))&rn);
+ sw = FD_ISSET(fd&((1<<INTBITS)-1), (typeof(fd_set*))&wn);
+#endif
+#else
+ pr = FD_ISSET(fd, PrevReadEvent);
+ pw = FD_ISSET(fd, PrevWriteEvent);
+ sr = FD_ISSET(fd, StaticReadEvent);
+ sw = FD_ISSET(fd, StaticWriteEvent);
+#endif
+ if (!((sr^pr) | (sw^pw)))
+ continue; /* this particular fd did not change */
+
+ ev.events = (sr ? EPOLLIN : 0) | (sw ? EPOLLOUT : 0);
+ ev.data.fd = fd;
+
+#ifdef EPOLL_CTL_MOD_WORKAROUND
+ /* I encountered a rarely reproducible problem with
+ * EPOLL_CTL_MOD where a modified FD (systematically
+ * the one in epoll_events[0], fd#7) would sometimes
+ * be set EPOLL_OUT while asked for a read ! This is
+ * with the 2.4 epoll patch. The workaround is to
+ * delete then recreate in case of modification.
+ * This is in 2.4 up to epoll-lt-0.21 but not in 2.6
+ * nor RHEL kernels.
+ */
+
+ if ((pr | pw) && fdtab[fd].state != FD_STCLOSE)
+ epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, &ev);
+
+ if ((sr | sw))
+ epoll_ctl(epoll_fd, EPOLL_CTL_ADD, fd, &ev);
+#else
+ if ((pr | pw)) {
+ /* the file-descriptor already exists... */
+ if ((sr | sw)) {
+ /* ...and it will still exist */
+ if (epoll_ctl(epoll_fd, EPOLL_CTL_MOD, fd, &ev) < 0) {
+ // perror("epoll_ctl(MOD)");
+ // exit(1);
+ }
+ } else {
+ /* ...and it will be removed */
+ if (fdtab[fd].state != FD_STCLOSE &&
+ epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, &ev) < 0) {
+ // perror("epoll_ctl(DEL)");
+ // exit(1);
+ }
+ }
+ } else {
+ /* the file-descriptor did not exist, let's add it */
+ if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, fd, &ev) < 0) {
+ // perror("epoll_ctl(ADD)");
+ // exit(1);
+ }
+ }
+#endif // EPOLL_CTL_MOD_WORKAROUND
+ }
+ ((int*)PrevReadEvent)[fds] = rn; /* remember the state just pushed to epoll */
+ ((int*)PrevWriteEvent)[fds] = wn;
+ }
+ }
+
+ /* now let's wait for events */
+ status = epoll_wait(epoll_fd, epoll_events, maxfd, wait_time); /* wait_time is handed over as the timeout */
+ tv_now(&now);
+
+ for (count = 0; count < status; count++) { /* not entered when epoll_wait() returned 0 or an error */
+ fd = epoll_events[count].data.fd;
+
+ if (FD_ISSET(fd, StaticReadEvent)) {
+ if (fdtab[fd].state == FD_STCLOSE)
+ continue;
+ if (epoll_events[count].events & ( EPOLLIN | EPOLLERR | EPOLLHUP ))
+ fdtab[fd].cb[DIR_RD].f(fd);
+ }
+
+ if (FD_ISSET(fd, StaticWriteEvent)) {
+ if (fdtab[fd].state == FD_STCLOSE) /* re-checked: this runs after the read callback */
+ continue;
+ if (epoll_events[count].events & ( EPOLLOUT | EPOLLERR | EPOLLHUP ))
+ fdtab[fd].cb[DIR_WR].f(fd);
+ }
+ }
+}
+
+/*
+ * Sole exported symbol: fills <p> with the epoll() poller. Always returns 1.
+ */
+int epoll_register(struct poller *p)
+{
+	p->isset   = __fd_isset;	/* FD state primitives */
+	p->set     = __fd_set;
+	p->clr     = __fd_clr;
+	p->cond_s  = __fd_cond_s;
+	p->cond_c  = __fd_cond_c;
+	p->rem     = __fd_rem;
+	p->clo     = __fd_clo;
+
+	p->init    = epoll_init;	/* life cycle and polling loop */
+	p->term    = epoll_term;
+	p->poll    = epoll_poll;
+	p->private = NULL;		/* identity and preference */
+	p->name    = "epoll";
+	p->pref    = 300;
+	return 1;
+}
+
+
+/*
+ * Local variables:
+ * c-indent-level: 8
+ * c-basic-offset: 8
+ * End:
+ */
diff --git a/src/ev_poll.c b/src/ev_poll.c
new file mode 100644
index 0000000..2d6d984
--- /dev/null
+++ b/src/ev_poll.c
@@ -0,0 +1,264 @@
+/*
+ * FD polling functions for generic poll()
+ *
+ * Copyright 2000-2007 Willy Tarreau <w@1wt.eu>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <unistd.h>
+#include <sys/time.h>
+#include <sys/types.h>
+
+#include <common/compat.h>
+#include <common/config.h>
+#include <common/time.h>
+
+#include <types/fd.h>
+#include <types/global.h>
+
+#include <proto/fd.h>
+#include <proto/polling.h>
+#include <proto/task.h>
+
+
+static fd_set *StaticReadEvent, *StaticWriteEvent;
+
+/* private data */
+static struct pollfd *poll_events = NULL;
+
+
+/*
+ * Benchmarks performed on a Pentium-M notebook show that using functions
+ * instead of the usual macros improve the FD_* performance by about 80%,
+ * and that marking them regparm(2) adds another 20%.
+ */
+REGPRM2 static int __fd_isset(const int fd, const int dir)
+{
+	/* Tell whether <fd> is currently polled for direction <dir>: DIR_RD
+	 * is tracked in StaticReadEvent, anything else in StaticWriteEvent.
+	 */
+	if (dir != DIR_RD)
+		return FD_ISSET(fd, StaticWriteEvent);
+
+	return FD_ISSET(fd, StaticReadEvent);
+}
+
+REGPRM2 static void __fd_set(const int fd, const int dir)
+{
+	/* Start polling <fd> for direction <dir>: DIR_RD marks it in
+	 * StaticReadEvent, any other value marks it in StaticWriteEvent.
+	 */
+	fd_set *target;
+
+	target = (dir == DIR_RD) ? StaticReadEvent : StaticWriteEvent;
+	FD_SET(fd, target);
+}
+
+REGPRM2 static void __fd_clr(const int fd, const int dir)
+{
+	/* Stop polling <fd> for direction <dir>: DIR_RD clears it from
+	 * StaticReadEvent, any other value clears it from StaticWriteEvent.
+	 */
+	fd_set *target;
+
+	target = (dir == DIR_RD) ? StaticReadEvent : StaticWriteEvent;
+	FD_CLR(fd, target);
+}
+
+REGPRM2 static int __fd_cond_s(const int fd, const int dir)
+{
+	/* Conditional set: start polling <fd> for <dir> only if it is not
+	 * polled yet. Returns 1 when the bit was just set, 0 when it was
+	 * already set.
+	 */
+	fd_set *target = (dir == DIR_RD) ? StaticReadEvent : StaticWriteEvent;
+
+	if (FD_ISSET(fd, target))
+		return 0;
+
+	FD_SET(fd, target);
+	return 1;
+}
+
+REGPRM2 static int __fd_cond_c(const int fd, const int dir)
+{
+	/* Conditional clear: stop polling <fd> for <dir> if it is polled.
+	 * Returns the non-zero FD_ISSET() result if cleared, otherwise 0.
+	 */
+	fd_set *target = (dir == DIR_RD) ? StaticReadEvent : StaticWriteEvent;
+	const int was_set = FD_ISSET(fd, target);
+
+	if (!was_set)
+		return 0;
+
+	FD_CLR(fd, target);
+	return was_set;
+}
+
+REGPRM1 static void __fd_rem(const int fd)
+{
+	FD_CLR(fd, StaticWriteEvent);	/* stop polling <fd> for writes */
+	FD_CLR(fd, StaticReadEvent);	/* ... and for reads */
+}
+
+
+
+/*
+ * Initialization of the poll() poller.
+ * Returns 0 in case of failure, non-zero in case of success. If it fails, it
+ * disables the poller by setting its pref to 0.
+ */
+REGPRM1 static int poll_init(struct poller *p)
+{
+ __label__ fail_swevt, fail_srevt, fail_pe; /* GCC local-label declarations */
+ int fd_set_bytes; /* byte size of each descriptor bitmap allocated below */
+
+ p->private = NULL;
+ fd_set_bytes = sizeof(fd_set) * (global.maxsock + FD_SETSIZE - 1) / FD_SETSIZE; /* scaled from global.maxsock */
+
+ poll_events = (struct pollfd*)
+ calloc(1, sizeof(struct pollfd) * global.maxsock); /* one pollfd slot per possible socket */
+
+ if (poll_events == NULL)
+ goto fail_pe;
+
+ if ((StaticReadEvent = (fd_set *)calloc(1, fd_set_bytes)) == NULL)
+ goto fail_srevt;
+
+ if ((StaticWriteEvent = (fd_set *)calloc(1, fd_set_bytes)) == NULL)
+ goto fail_swevt;
+
+ return 1; /* success: everything is allocated */
+
+ fail_swevt: /* unwind: release in reverse order of allocation */
+ free(StaticReadEvent); /* NOTE(review): freed pointers are not reset to NULL here */
+ fail_srevt:
+ free(poll_events);
+ fail_pe:
+ p->pref = 0; /* disable this poller */
+ return 0;
+}
+
+/*
+ * Termination of the poll() poller: memory is released (and the pointers
+ * reset to NULL so none dangles) and the poller is marked as unselectable.
+ */
+REGPRM1 static void poll_term(struct poller *p)
+{
+	free(StaticWriteEvent);		/* free(NULL) is a no-op: no guard needed */
+	StaticWriteEvent = NULL;
+	free(StaticReadEvent);
+	StaticReadEvent = NULL;
+	free(poll_events);
+	poll_events = NULL;
+	p->private = NULL;
+	p->pref = 0;			/* term() must leave the poller disabled */
+}
+
+/*
+ * Poll() poller
+ */
+REGPRM2 static void poll_poll(struct poller *p, int wait_time)
+{
+ int status;
+ int fd, nbfd; /* nbfd: number of poll_events[] entries filled for this call */
+
+ int fds, count;
+ int sr, sw; /* wanted read / write bit of the fd being examined */
+ unsigned rn, wn; /* read new, write new */
+
+ nbfd = 0;
+ for (fds = 0; (fds << INTBITS) < maxfd; fds++) { /* walk the bitmaps one int-sized word at a time */
+
+ rn = ((int*)StaticReadEvent)[fds];
+ wn = ((int*)StaticWriteEvent)[fds];
+
+ if ((rn|wn)) { /* skip words in which no fd is polled */
+ for (count = 0, fd = fds << INTBITS; count < (1<<INTBITS) && fd < maxfd; count++, fd++) {
+#define FDSETS_ARE_INT_ALIGNED
+#ifdef FDSETS_ARE_INT_ALIGNED
+
+#define WE_REALLY_NOW_THAT_FDSETS_ARE_INTS
+#ifdef WE_REALLY_NOW_THAT_FDSETS_ARE_INTS
+ sr = (rn >> count) & 1;
+ sw = (wn >> count) & 1;
+#else
+ sr = FD_ISSET(fd&((1<<INTBITS)-1), (typeof(fd_set*))&rn);
+ sw = FD_ISSET(fd&((1<<INTBITS)-1), (typeof(fd_set*))&wn);
+#endif
+#else
+ sr = FD_ISSET(fd, StaticReadEvent);
+ sw = FD_ISSET(fd, StaticWriteEvent);
+#endif
+ if ((sr|sw)) {
+ poll_events[nbfd].fd = fd;
+ poll_events[nbfd].events = (sr ? POLLIN : 0) | (sw ? POLLOUT : 0);
+ nbfd++;
+ }
+ }
+ }
+ }
+
+ /* now let's wait for events */
+ status = poll(poll_events, nbfd, wait_time); /* wait_time is handed over as the timeout */
+ tv_now(&now);
+
+ for (count = 0; status > 0 && count < nbfd; count++) { /* stops once <status> active entries were seen */
+ fd = poll_events[count].fd;
+
+ if (!(poll_events[count].revents & ( POLLOUT | POLLIN | POLLERR | POLLHUP )))
+ continue;
+
+ /* ok, we found one active fd */
+ status--;
+
+ if (FD_ISSET(fd, StaticReadEvent)) {
+ if (fdtab[fd].state == FD_STCLOSE)
+ continue;
+ if (poll_events[count].revents & ( POLLIN | POLLERR | POLLHUP ))
+ fdtab[fd].cb[DIR_RD].f(fd);
+ }
+
+ if (FD_ISSET(fd, StaticWriteEvent)) {
+ if (fdtab[fd].state == FD_STCLOSE) /* re-checked: this runs after the read callback */
+ continue;
+ if (poll_events[count].revents & ( POLLOUT | POLLERR | POLLHUP ))
+ fdtab[fd].cb[DIR_WR].f(fd);
+ }
+ }
+
+}
+
+/*
+ * Sole exported symbol: fills <p> with the poll() poller. Always returns 1.
+ */
+int poll_register(struct poller *p)
+{
+	p->isset   = __fd_isset;	/* FD state primitives */
+	p->set     = __fd_set;
+	p->clr     = __fd_clr;
+	p->cond_s  = __fd_cond_s;
+	p->cond_c  = __fd_cond_c;
+	p->rem     = __fd_rem;
+	p->clo     = __fd_rem;		/* closing is handled like rem() here */
+	p->init    = poll_init;		/* life cycle and polling loop */
+	p->term    = poll_term;
+	p->poll    = poll_poll;
+	p->private = NULL;		/* identity and preference */
+	p->name    = "poll";
+	p->pref    = 200;
+	return 1;
+}
+
+
+/*
+ * Local variables:
+ * c-indent-level: 8
+ * c-basic-offset: 8
+ * End:
+ */
diff --git a/src/ev_select.c b/src/ev_select.c
new file mode 100644
index 0000000..b1cd44e
--- /dev/null
+++ b/src/ev_select.c
@@ -0,0 +1,264 @@
+/*
+ * FD polling functions for generic select()
+ *
+ * Copyright 2000-2007 Willy Tarreau <w@1wt.eu>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <unistd.h>
+#include <sys/time.h>
+#include <sys/types.h>
+
+#include <common/compat.h>
+#include <common/config.h>
+#include <common/time.h>
+
+#include <types/fd.h>
+#include <types/global.h>
+
+#include <proto/fd.h>
+#include <proto/polling.h>
+#include <proto/task.h>
+
+
+static fd_set *ReadEvent, *WriteEvent;
+static fd_set *StaticReadEvent, *StaticWriteEvent;
+
+
+/*
+ * Benchmarks performed on a Pentium-M notebook show that using functions
+ * instead of the usual macros improve the FD_* performance by about 80%,
+ * and that marking them regparm(2) adds another 20%.
+ */
+REGPRM2 static int __fd_isset(const int fd, const int dir)
+{
+	/* Tell whether <fd> is currently polled for direction <dir>: DIR_RD
+	 * is tracked in StaticReadEvent, anything else in StaticWriteEvent.
+	 */
+	if (dir != DIR_RD)
+		return FD_ISSET(fd, StaticWriteEvent);
+
+	return FD_ISSET(fd, StaticReadEvent);
+}
+
+REGPRM2 static void __fd_set(const int fd, const int dir)
+{
+	/* Start polling <fd> for direction <dir>: DIR_RD marks it in
+	 * StaticReadEvent, any other value marks it in StaticWriteEvent.
+	 */
+	fd_set *target;
+
+	target = (dir == DIR_RD) ? StaticReadEvent : StaticWriteEvent;
+	FD_SET(fd, target);
+}
+
+REGPRM2 static void __fd_clr(const int fd, const int dir)
+{
+	/* Stop polling <fd> for direction <dir>: DIR_RD clears it from
+	 * StaticReadEvent, any other value clears it from StaticWriteEvent.
+	 */
+	fd_set *target;
+
+	target = (dir == DIR_RD) ? StaticReadEvent : StaticWriteEvent;
+	FD_CLR(fd, target);
+}
+
+REGPRM2 static int __fd_cond_s(const int fd, const int dir)
+{
+	/* Conditional set: start polling <fd> for <dir> only if it is not
+	 * polled yet. Returns 1 when the bit was just set, 0 when it was
+	 * already set.
+	 */
+	fd_set *target = (dir == DIR_RD) ? StaticReadEvent : StaticWriteEvent;
+
+	if (FD_ISSET(fd, target))
+		return 0;
+
+	FD_SET(fd, target);
+	return 1;
+}
+
+REGPRM2 static int __fd_cond_c(const int fd, const int dir)
+{
+	/* Conditional clear: stop polling <fd> for <dir> if it is polled.
+	 * Returns the non-zero FD_ISSET() result if cleared, otherwise 0.
+	 */
+	fd_set *target = (dir == DIR_RD) ? StaticReadEvent : StaticWriteEvent;
+	const int was_set = FD_ISSET(fd, target);
+
+	if (!was_set)
+		return 0;
+
+	FD_CLR(fd, target);
+	return was_set;
+}
+
+REGPRM1 static void __fd_rem(const int fd)
+{
+	FD_CLR(fd, StaticWriteEvent);	/* stop polling <fd> for writes */
+	FD_CLR(fd, StaticReadEvent);	/* ... and for reads */
+}
+
+
+/*
+ * Initialization of the select() poller.
+ * Returns 0 in case of failure, non-zero in case of success. If it fails, it
+ * disables the poller by setting its pref to 0.
+ */
+REGPRM1 static int select_init(struct poller *p)
+{
+ __label__ fail_swevt, fail_srevt, fail_wevt, fail_revt; /* GCC local-label declarations */
+ int fd_set_bytes; /* byte size of each descriptor bitmap allocated below */
+
+ p->private = NULL;
+ fd_set_bytes = sizeof(fd_set) * (global.maxsock + FD_SETSIZE - 1) / FD_SETSIZE; /* scaled from global.maxsock */
+
+ if ((ReadEvent = (fd_set *)calloc(1, fd_set_bytes)) == NULL)
+ goto fail_revt;
+
+ if ((WriteEvent = (fd_set *)calloc(1, fd_set_bytes)) == NULL)
+ goto fail_wevt;
+
+ if ((StaticReadEvent = (fd_set *)calloc(1, fd_set_bytes)) == NULL)
+ goto fail_srevt;
+
+ if ((StaticWriteEvent = (fd_set *)calloc(1, fd_set_bytes)) == NULL)
+ goto fail_swevt;
+
+ return 1; /* success: everything is allocated */
+
+ fail_swevt: /* unwind: release in reverse order of allocation */
+ free(StaticReadEvent); /* NOTE(review): freed pointers are not reset to NULL here */
+ fail_srevt:
+ free(WriteEvent);
+ fail_wevt:
+ free(ReadEvent);
+ fail_revt:
+ p->pref = 0; /* disable this poller */
+ return 0;
+}
+
+/*
+ * Termination of the select() poller: memory is released (and the pointers
+ * reset to NULL so none dangles) and the poller is marked as unselectable.
+ */
+REGPRM1 static void select_term(struct poller *p)
+{
+	free(StaticWriteEvent);		/* free(NULL) is a no-op: no guard needed */
+	StaticWriteEvent = NULL;
+	free(StaticReadEvent);
+	StaticReadEvent = NULL;
+	free(WriteEvent);
+	WriteEvent = NULL;
+	free(ReadEvent);
+	ReadEvent = NULL;
+	p->private = NULL;
+	p->pref = 0;			/* term() must leave the poller disabled */
+}
+
+/*
+ * Select() poller
+ */
+REGPRM2 static void select_poll(struct poller *p, int wait_time)
+{
+ int status;
+ int fd, i;
+ struct timeval delta;
+ int readnotnull, writenotnull; /* non-zero once any bit is found set in the corresponding set */
+ int fds;
+ char count; /* counts down from 1<<INTBITS, so that value must fit in a char */
+
+ /* allow select to return immediately when needed */
+ delta.tv_sec = delta.tv_usec = 0;
+ if (wait_time > 0) { /* FIXME */
+ /* Convert to timeval */
+ /* to avoid possible select loops due to timer precision */
+ wait_time += SCHEDULER_RESOLUTION;
+ delta.tv_sec = wait_time / 1000; /* wait_time is in milliseconds */
+ delta.tv_usec = (wait_time % 1000) * 1000;
+ }
+
+ /* let's restore fdset state */
+
+ readnotnull = 0; writenotnull = 0;
+ for (i = 0; i < (maxfd + FD_SETSIZE - 1)/(8*sizeof(int)); i++) { /* copy Static* into the working sets, one int at a time */
+ readnotnull |= (*(((int*)ReadEvent)+i) = *(((int*)StaticReadEvent)+i)) != 0;
+ writenotnull |= (*(((int*)WriteEvent)+i) = *(((int*)StaticWriteEvent)+i)) != 0;
+ }
+
+ // /* just a verification code, needs to be removed for performance */
+ // for (i=0; i<maxfd; i++) {
+ // if (FD_ISSET(i, ReadEvent) != FD_ISSET(i, StaticReadEvent))
+ // abort();
+ // if (FD_ISSET(i, WriteEvent) != FD_ISSET(i, StaticWriteEvent))
+ // abort();
+ //
+ // }
+
+ status = select(maxfd,
+ readnotnull ? ReadEvent : NULL,
+ writenotnull ? WriteEvent : NULL,
+ NULL,
+ (wait_time >= 0) ? &delta : NULL); /* negative wait_time: no timeout is passed */
+
+ tv_now(&now);
+
+ if (status <= 0)
+ return; /* timeout or error: nothing to dispatch */
+
+ for (fds = 0; (fds << INTBITS) < maxfd; fds++) { /* walk the result sets one int-sized word at a time */
+ if ((((int *)(ReadEvent))[fds] | ((int *)(WriteEvent))[fds]) == 0)
+ continue;
+
+ for (count = 1<<INTBITS, fd = fds << INTBITS; count && fd < maxfd; count--, fd++) {
+ /* if we specify read first, the accepts and zero reads will be
+ * seen first. Moreover, system buffers will be flushed faster.
+ */
+ if (FD_ISSET(fd, ReadEvent)) {
+ if (fdtab[fd].state == FD_STCLOSE)
+ continue;
+ fdtab[fd].cb[DIR_RD].f(fd);
+ }
+
+ if (FD_ISSET(fd, WriteEvent)) {
+ if (fdtab[fd].state == FD_STCLOSE) /* re-checked: this runs after the read callback */
+ continue;
+ fdtab[fd].cb[DIR_WR].f(fd);
+ }
+ }
+ }
+}
+
+/*
+ * Sole exported symbol: fills <p> with the select() poller. Always returns 1.
+ */
+int select_register(struct poller *p)
+{
+	p->isset   = __fd_isset;	/* FD state primitives */
+	p->set     = __fd_set;
+	p->clr     = __fd_clr;
+	p->cond_s  = __fd_cond_s;
+	p->cond_c  = __fd_cond_c;
+	p->rem     = __fd_rem;
+	p->clo     = __fd_rem;		/* closing is handled like rem() here */
+	p->init    = select_init;	/* life cycle and polling loop */
+	p->term    = select_term;
+	p->poll    = select_poll;
+	p->private = NULL;		/* identity and preference */
+	p->name    = "select";
+	p->pref    = 150;
+	return 1;
+}
+
+
+/*
+ * Local variables:
+ * c-indent-level: 8
+ * c-basic-offset: 8
+ * End:
+ */
diff --git a/src/fd.c b/src/fd.c
index 5fd3c27..18ccb24 100644
--- a/src/fd.c
+++ b/src/fd.c
@@ -1,7 +1,7 @@
/*
* File descriptors management functions.
*
- * Copyright 2000-2006 Willy Tarreau <w@1wt.eu>
+ * Copyright 2000-2007 Willy Tarreau <w@1wt.eu>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -10,509 +10,110 @@
*
*/
-/*
- * FIXME:
- * - we still use 'listeners' to check whether we want to stop or not.
- * - the various pollers should be moved to other external files, possibly
- * dynamic libs.
- */
-
+#include <string.h>
#include <unistd.h>
-#include <sys/time.h>
#include <sys/types.h>
#include <common/compat.h>
#include <common/config.h>
-#include <common/time.h>
#include <types/fd.h>
#include <types/global.h>
#include <proto/fd.h>
-#include <proto/polling.h>
-#include <proto/task.h>
struct fdtab *fdtab = NULL; /* array of all the file descriptors */
int maxfd; /* # of the highest fd + 1 */
int totalconn; /* total # of terminated sessions */
int actconn; /* # of active sessions */
-fd_set *StaticReadEvent, *StaticWriteEvent;
int cfg_polling_mechanism = 0; /* POLL_USE_{SELECT|POLL|EPOLL} */
-
-/******************************
- * pollers
- ******************************/
+struct poller pollers[MAX_POLLERS];
+struct poller cur_poller;
+int nbpollers = 0;
-#if !defined(CONFIG_HAP_INLINE_FD_SET)
-/*
- * Benchmarks performed on a Pentium-M notebook show that using functions
- * instead of the usual macros improve the FD_* performance by about 80%,
- * and that marking them regparm(2) adds another 20%.
- */
-REGPRM2 void my_fd_set(const int fd, fd_set *ev)
-{
- FD_SET(fd, ev);
-}
-
-REGPRM2 void my_fd_clr(const int fd, fd_set *ev)
-{
- FD_CLR(fd, ev);
-}
+/*********************
+ * generic functions
+ *********************/
-REGPRM2 int my_fd_isset(const int fd, const fd_set *ev)
-{
- return FD_ISSET(fd, ev);
-}
+extern int select_register(struct poller *p);
+#if defined(ENABLE_POLL)
+extern int poll_register(struct poller *p);
#endif
-
-
-/*
- * FIXME: this is dirty, but at the moment, there's no other solution to remove
- * the old FDs from outside the loop. Perhaps we should export a global 'poll'
- * structure with pointers to functions such as init_fd() and close_fd(), plus
- * a private structure with several pointers to places such as below.
- */
-
#if defined(ENABLE_EPOLL)
-fd_set *PrevReadEvent = NULL, *PrevWriteEvent = NULL;
-
-#if defined(USE_MY_EPOLL)
-#include <errno.h>
-#include <sys/syscall.h>
-_syscall1 (int, epoll_create, int, size);
-_syscall4 (int, epoll_ctl, int, epfd, int, op, int, fd, struct epoll_event *, event);
-_syscall4 (int, epoll_wait, int, epfd, struct epoll_event *, events, int, maxevents, int, timeout);
+extern int epoll_register(struct poller *p);
#endif
-/*
- * Main epoll() loop.
- * does 3 actions :
- * 0 (POLL_LOOP_ACTION_INIT) : initializes necessary private structures
- * 1 (POLL_LOOP_ACTION_RUN) : runs the loop
- * 2 (POLL_LOOP_ACTION_CLEAN) : cleans up
- *
- * returns 0 if initialization failed, !0 otherwise.
- */
-int epoll_loop(int action)
+/* Deletes an FD: calls EV_FD_CLO() on it instead of touching the fdsets
+ * directly, closes the file descriptor, marks its fdtab entry FD_STCLOSE and
+ * recomputes the maxfd limit.
+ */
+void fd_delete(int fd)
{
- int next_time;
- int status;
- int fd;
-
- int fds, count;
- int pr, pw, sr, sw;
- unsigned rn, ro, wn, wo; /* read new, read old, write new, write old */
- struct epoll_event ev;
-
- /* private data */
- static struct epoll_event *epoll_events = NULL;
- static int epoll_fd;
-
- if (action == POLL_LOOP_ACTION_INIT) {
- epoll_fd = epoll_create(global.maxsock + 1);
- if (epoll_fd < 0)
- return 0;
- else {
- epoll_events = (struct epoll_event*)
- calloc(1, sizeof(struct epoll_event) * global.maxsock);
- PrevReadEvent = (fd_set *)
- calloc(1, sizeof(fd_set) * (global.maxsock + FD_SETSIZE - 1) / FD_SETSIZE);
- PrevWriteEvent = (fd_set *)
- calloc(1, sizeof(fd_set) * (global.maxsock + FD_SETSIZE - 1) / FD_SETSIZE);
- }
- return 1;
- }
- else if (action == POLL_LOOP_ACTION_CLEAN) {
- if (PrevWriteEvent) free(PrevWriteEvent);
- if (PrevReadEvent) free(PrevReadEvent);
- if (epoll_events) free(epoll_events);
- close(epoll_fd);
- epoll_fd = 0;
- return 1;
- }
-
- /* OK, it's POLL_LOOP_ACTION_RUN */
-
- tv_now(&now);
-
- while (1) {
- next_time = process_runnable_tasks();
-
- /* stop when there's no connection left and we don't allow them anymore */
- if (!actconn && listeners == 0)
- break;
-
- for (fds = 0; (fds << INTBITS) < maxfd; fds++) {
-
- rn = ((int*)StaticReadEvent)[fds]; ro = ((int*)PrevReadEvent)[fds];
- wn = ((int*)StaticWriteEvent)[fds]; wo = ((int*)PrevWriteEvent)[fds];
-
- if ((ro^rn) | (wo^wn)) {
- for (count = 0, fd = fds << INTBITS; count < (1<<INTBITS) && fd < maxfd; count++, fd++) {
-#define FDSETS_ARE_INT_ALIGNED
-#ifdef FDSETS_ARE_INT_ALIGNED
-
-#define WE_REALLY_NOW_THAT_FDSETS_ARE_INTS
-#ifdef WE_REALLY_NOW_THAT_FDSETS_ARE_INTS
- pr = (ro >> count) & 1;
- pw = (wo >> count) & 1;
- sr = (rn >> count) & 1;
- sw = (wn >> count) & 1;
-#else
- pr = FD_ISSET(fd&((1<<INTBITS)-1), (typeof(fd_set*))&ro);
- pw = FD_ISSET(fd&((1<<INTBITS)-1), (typeof(fd_set*))&wo);
- sr = FD_ISSET(fd&((1<<INTBITS)-1), (typeof(fd_set*))&rn);
- sw = FD_ISSET(fd&((1<<INTBITS)-1), (typeof(fd_set*))&wn);
-#endif
-#else
- pr = FD_ISSET(fd, PrevReadEvent);
- pw = FD_ISSET(fd, PrevWriteEvent);
- sr = FD_ISSET(fd, StaticReadEvent);
- sw = FD_ISSET(fd, StaticWriteEvent);
-#endif
- if (!((sr^pr) | (sw^pw)))
- continue;
-
- ev.events = (sr ? EPOLLIN : 0) | (sw ? EPOLLOUT : 0);
- ev.data.fd = fd;
-
-#ifdef EPOLL_CTL_MOD_WORKAROUND
- /* I encountered a rarely reproducible problem with
- * EPOLL_CTL_MOD where a modified FD (systematically
- * the one in epoll_events[0], fd#7) would sometimes
- * be set EPOLL_OUT while asked for a read ! This is
- * with the 2.4 epoll patch. The workaround is to
- * delete then recreate in case of modification.
- * This is in 2.4 up to epoll-lt-0.21 but not in 2.6
- * nor RHEL kernels.
- */
-
- if ((pr | pw) && fdtab[fd].state != FD_STCLOSE)
- epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, &ev);
-
- if ((sr | sw))
- epoll_ctl(epoll_fd, EPOLL_CTL_ADD, fd, &ev);
-#else
- if ((pr | pw)) {
- /* the file-descriptor already exists... */
- if ((sr | sw)) {
- /* ...and it will still exist */
- if (epoll_ctl(epoll_fd, EPOLL_CTL_MOD, fd, &ev) < 0) {
- // perror("epoll_ctl(MOD)");
- // exit(1);
- }
- } else {
- /* ...and it will be removed */
- if (fdtab[fd].state != FD_STCLOSE &&
- epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, &ev) < 0) {
- // perror("epoll_ctl(DEL)");
- // exit(1);
- }
- }
- } else {
- /* the file-descriptor did not exist, let's add it */
- if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, fd, &ev) < 0) {
- // perror("epoll_ctl(ADD)");
- // exit(1);
- }
- }
-#endif // EPOLL_CTL_MOD_WORKAROUND
- }
- ((int*)PrevReadEvent)[fds] = rn;
- ((int*)PrevWriteEvent)[fds] = wn;
- }
- }
-
- /* now let's wait for events */
- status = epoll_wait(epoll_fd, epoll_events, maxfd, next_time);
- tv_now(&now);
-
- for (count = 0; count < status; count++) {
- fd = epoll_events[count].data.fd;
-
- if (FD_ISSET(fd, StaticReadEvent)) {
- if (fdtab[fd].state == FD_STCLOSE)
- continue;
- if (epoll_events[count].events & ( EPOLLIN | EPOLLERR | EPOLLHUP ))
- fdtab[fd].cb[DIR_RD].f(fd);
- }
+ EV_FD_CLO(fd);
+ close(fd);
+ fdtab[fd].state = FD_STCLOSE;
- if (FD_ISSET(fd, StaticWriteEvent)) {
- if (fdtab[fd].state == FD_STCLOSE)
- continue;
- if (epoll_events[count].events & ( EPOLLOUT | EPOLLERR | EPOLLHUP ))
- fdtab[fd].cb[DIR_WR].f(fd);
- }
- }
- }
- return 1;
+ while ((maxfd-1 >= 0) && (fdtab[maxfd-1].state == FD_STCLOSE))
+ maxfd--;
}
-#endif
-
-#if defined(ENABLE_POLL)
-/*
- * Main poll() loop.
- * does 3 actions :
- * 0 (POLL_LOOP_ACTION_INIT) : initializes necessary private structures
- * 1 (POLL_LOOP_ACTION_RUN) : runs the loop
- * 2 (POLL_LOOP_ACTION_CLEAN) : cleans up
- *
- * returns 0 if initialization failed, !0 otherwise.
- */
-
-int poll_loop(int action)
+/*
+ * Registers all pollers known at build time into pollers[]. Each poller's
+ * registration function receives the next free slot, and that slot is only
+ * accounted for in <nbpollers> when the function returns non-zero. A poller
+ * whose registration returns zero is therefore left out and its slot is
+ * handed to the next one. select is always attempted; poll and epoll only
+ * when ENABLE_POLL / ENABLE_EPOLL are defined.
+ */
+void register_pollers()
{
- int next_time;
- int status;
- int fd, nbfd;
-
- int fds, count;
- int sr, sw;
- unsigned rn, wn; /* read new, write new */
-
- /* private data */
- static struct pollfd *poll_events = NULL;
-
- if (action == POLL_LOOP_ACTION_INIT) {
- poll_events = (struct pollfd*)
- calloc(1, sizeof(struct pollfd) * global.maxsock);
- return 1;
- }
- else if (action == POLL_LOOP_ACTION_CLEAN) {
- if (poll_events)
- free(poll_events);
- return 1;
- }
-
- /* OK, it's POLL_LOOP_ACTION_RUN */
-
- tv_now(&now);
-
- while (1) {
- next_time = process_runnable_tasks();
-
- /* stop when there's no connection left and we don't allow them anymore */
- if (!actconn && listeners == 0)
- break;
-
- nbfd = 0;
- for (fds = 0; (fds << INTBITS) < maxfd; fds++) {
-
- rn = ((int*)StaticReadEvent)[fds];
- wn = ((int*)StaticWriteEvent)[fds];
-
- if ((rn|wn)) {
- for (count = 0, fd = fds << INTBITS; count < (1<<INTBITS) && fd < maxfd; count++, fd++) {
-#define FDSETS_ARE_INT_ALIGNED
-#ifdef FDSETS_ARE_INT_ALIGNED
-
-#define WE_REALLY_NOW_THAT_FDSETS_ARE_INTS
-#ifdef WE_REALLY_NOW_THAT_FDSETS_ARE_INTS
- sr = (rn >> count) & 1;
- sw = (wn >> count) & 1;
-#else
- sr = FD_ISSET(fd&((1<<INTBITS)-1), (typeof(fd_set*))&rn);
- sw = FD_ISSET(fd&((1<<INTBITS)-1), (typeof(fd_set*))&wn);
-#endif
-#else
- sr = FD_ISSET(fd, StaticReadEvent);
- sw = FD_ISSET(fd, StaticWriteEvent);
+	if (select_register(&pollers[nbpollers]))
+		nbpollers++;
+#if defined(ENABLE_POLL)
+	/* same rule as select: only keep the slot if registration worked */
+	if (poll_register(&pollers[nbpollers]))
+		nbpollers++;
#endif
- if ((sr|sw)) {
- poll_events[nbfd].fd = fd;
- poll_events[nbfd].events = (sr ? POLLIN : 0) | (sw ? POLLOUT : 0);
- nbfd++;
- }
- }
- }
- }
-
- /* now let's wait for events */
- status = poll(poll_events, nbfd, next_time);
- tv_now(&now);
-
- for (count = 0; status > 0 && count < nbfd; count++) {
- fd = poll_events[count].fd;
-
- if (!(poll_events[count].revents & ( POLLOUT | POLLIN | POLLERR | POLLHUP )))
- continue;
- /* ok, we found one active fd */
- status--;
-
- if (FD_ISSET(fd, StaticReadEvent)) {
- if (fdtab[fd].state == FD_STCLOSE)
- continue;
- if (poll_events[count].revents & ( POLLIN | POLLERR | POLLHUP ))
- fdtab[fd].cb[DIR_RD].f(fd);
- }
-
- if (FD_ISSET(fd, StaticWriteEvent)) {
- if (fdtab[fd].state == FD_STCLOSE)
- continue;
- if (poll_events[count].revents & ( POLLOUT | POLLERR | POLLHUP ))
- fdtab[fd].cb[DIR_WR].f(fd);
- }
- }
- }
- return 1;
-}
+#if defined(ENABLE_EPOLL)
+	/* same rule as select: only keep the slot if registration worked */
+	if (epoll_register(&pollers[nbpollers]))
+		nbpollers++;
#endif
+}
+/*
+ * Disables every registered poller whose name equals <poller_name> by
+ * resetting its preference to zero, the value init_pollers() refuses to
+ * select. A name matching no registered poller leaves everything untouched.
+ */
+void disable_poller(const char *poller_name)
+{
+	struct poller *cur = pollers;
+	struct poller *end = pollers + nbpollers;
+
+	while (cur < end) {
+		if (!strcmp(cur->name, poller_name))
+			cur->pref = 0;
+		cur++;
+	}
+}
/*
- * Main select() loop.
- * does 3 actions :
- * 0 (POLL_LOOP_ACTION_INIT) : initializes necessary private structures
- * 1 (POLL_LOOP_ACTION_RUN) : runs the loop
- * 2 (POLL_LOOP_ACTION_CLEAN) : cleans up
- *
- * returns 0 if initialization failed, !0 otherwise.
+ * Initialize the pollers till the best one is found. On each pass the
+ * registered poller with the highest preference is picked and its init()
+ * callback is called; on success the poller is copied into cur_poller. When
+ * init() fails, the poller's preference is reset to zero here, so the next
+ * pass moves on to the next best one whether or not init() cleared the
+ * preference itself. The search stops once no poller with a non-zero
+ * preference remains.
+ * If none works, returns 0, otherwise 1.
*/
-
-
-int select_loop(int action)
+int init_pollers()
{
- int next_time;
- int status;
- int fd,i;
- struct timeval delta;
- int readnotnull, writenotnull;
- static fd_set *ReadEvent = NULL, *WriteEvent = NULL;
+	int p;
+	struct poller *bp;
- if (action == POLL_LOOP_ACTION_INIT) {
- ReadEvent = (fd_set *)
- calloc(1, sizeof(fd_set) * (global.maxsock + FD_SETSIZE - 1) / FD_SETSIZE);
- WriteEvent = (fd_set *)
- calloc(1, sizeof(fd_set) * (global.maxsock + FD_SETSIZE - 1) / FD_SETSIZE);
- return 1;
- }
- else if (action == POLL_LOOP_ACTION_CLEAN) {
- if (WriteEvent) free(WriteEvent);
- if (ReadEvent) free(ReadEvent);
- return 1;
- }
- /* OK, it's POLL_LOOP_ACTION_RUN */
+	do {
+		/* look for the remaining poller with the highest preference */
+		bp = NULL;
+		for (p = 0; p < nbpollers; p++)
+			if (!bp || (pollers[p].pref > bp->pref))
+				bp = &pollers[p];
- tv_now(&now);
-
- while (1) {
- next_time = process_runnable_tasks();
-
- /* stop when there's no connection left and we don't allow them anymore */
- if (!actconn && listeners == 0)
+		/* nothing registered, or every candidate is disabled/failed */
+		if (!bp || bp->pref == 0)
break;
- if (next_time > 0) { /* FIXME */
- /* Convert to timeval */
- /* to avoid eventual select loops due to timer precision */
- next_time += SCHEDULER_RESOLUTION;
- delta.tv_sec = next_time / 1000;
- delta.tv_usec = (next_time % 1000) * 1000;
- }
- else if (next_time == 0) { /* allow select to return immediately when needed */
- delta.tv_sec = delta.tv_usec = 0;
- }
-
-
- /* let's restore fdset state */
-
- readnotnull = 0; writenotnull = 0;
- for (i = 0; i < (maxfd + FD_SETSIZE - 1)/(8*sizeof(int)); i++) {
- readnotnull |= (*(((int*)ReadEvent)+i) = *(((int*)StaticReadEvent)+i)) != 0;
- writenotnull |= (*(((int*)WriteEvent)+i) = *(((int*)StaticWriteEvent)+i)) != 0;
+		if (bp->init(bp)) {
+			memcpy(&cur_poller, bp, sizeof(*bp));
+			return 1;
}
-
- // /* just a verification code, needs to be removed for performance */
- // for (i=0; i<maxfd; i++) {
- // if (FD_ISSET(i, ReadEvent) != FD_ISSET(i, StaticReadEvent))
- // abort();
- // if (FD_ISSET(i, WriteEvent) != FD_ISSET(i, StaticWriteEvent))
- // abort();
- //
- // }
-
- status = select(maxfd,
- readnotnull ? ReadEvent : NULL,
- writenotnull ? WriteEvent : NULL,
- NULL,
- (next_time >= 0) ? &delta : NULL);
-
- /* this is an experiment on the separation of the select work */
- // status = (readnotnull ? select(maxfd, ReadEvent, NULL, NULL, (next_time >= 0) ? &delta : NULL) : 0);
- // status |= (writenotnull ? select(maxfd, NULL, WriteEvent, NULL, (next_time >= 0) ? &delta : NULL) : 0);
-
- tv_now(&now);
-
- if (status > 0) { /* must proceed with events */
-
- int fds;
- char count;
-
- for (fds = 0; (fds << INTBITS) < maxfd; fds++)
- if ((((int *)(ReadEvent))[fds] | ((int *)(WriteEvent))[fds]) != 0)
- for (count = 1<<INTBITS, fd = fds << INTBITS; count && fd < maxfd; count--, fd++) {
-
- /* if we specify read first, the accepts and zero reads will be
- * seen first. Moreover, system buffers will be flushed faster.
- */
- if (FD_ISSET(fd, ReadEvent)) {
- if (fdtab[fd].state == FD_STCLOSE)
- continue;
- fdtab[fd].cb[DIR_RD].f(fd);
- }
-
- if (FD_ISSET(fd, WriteEvent)) {
- if (fdtab[fd].state == FD_STCLOSE)
- continue;
- fdtab[fd].cb[DIR_WR].f(fd);
- }
- }
- }
- else {
- // fprintf(stderr,"select returned %d, maxfd=%d\n", status, maxfd);
- }
- }
- return 1;
-}
-
-
-
-/*********************
- * generic functions
- *********************/
-
-
-/* Deletes an FD from the fdsets, and recomputes the maxfd limit.
- * The file descriptor is also closed.
- */
-void fd_delete(int fd)
-{
- MY_FD_CLR(fd, StaticReadEvent);
- MY_FD_CLR(fd, StaticWriteEvent);
-#if defined(ENABLE_EPOLL)
- if (PrevReadEvent) {
- MY_FD_CLR(fd, PrevReadEvent);
- MY_FD_CLR(fd, PrevWriteEvent);
- }
-#endif
-
- close(fd);
- fdtab[fd].state = FD_STCLOSE;
-
- while ((maxfd-1 >= 0) && (fdtab[maxfd-1].state == FD_STCLOSE))
- maxfd--;
+
+		/* init() failed: never pick this poller again, whether or
+		 * not it cleared its own preference, and try the next best.
+		 */
+		bp->pref = 0;
+	} while (1);
+	return 0;
}
-
/*
* Local variables:
* c-indent-level: 8
diff --git a/src/haproxy.c b/src/haproxy.c
index 5b3ade2..7b23e2f 100644
--- a/src/haproxy.c
+++ b/src/haproxy.c
@@ -507,18 +507,34 @@
if (global.nbproc < 1)
global.nbproc = 1;
- StaticReadEvent = (fd_set *)calloc(1,
- sizeof(fd_set) *
- (global.maxsock + FD_SETSIZE - 1) / FD_SETSIZE);
- StaticWriteEvent = (fd_set *)calloc(1,
- sizeof(fd_set) *
- (global.maxsock + FD_SETSIZE - 1) / FD_SETSIZE);
-
fdtab = (struct fdtab *)calloc(1,
sizeof(struct fdtab) * (global.maxsock));
for (i = 0; i < global.maxsock; i++) {
fdtab[i].state = FD_STCLOSE;
}
+
+ register_pollers();
+ /* Note: we could register external pollers here */
+
+ if (!(cfg_polling_mechanism & POLL_USE_EPOLL))
+ disable_poller("epoll");
+
+ if (!(cfg_polling_mechanism & POLL_USE_POLL))
+ disable_poller("poll");
+
+ if (!(cfg_polling_mechanism & POLL_USE_SELECT))
+ disable_poller("select");
+
+ /* Note: we could disable any poller by name here */
+
+ if (!init_pollers()) {
+ Alert("No polling mechanism available\n");
+ exit(1);
+ }
+ if (global.mode & MODE_DEBUG) {
+ printf("Note: using %s() as the polling mechanism.\n", cur_poller.name);
+ }
+
}
void deinit(void)
@@ -603,8 +619,6 @@
if (global.chroot) free(global.chroot);
if (global.pidfile) free(global.pidfile);
- if (StaticReadEvent) free(StaticReadEvent);
- if (StaticWriteEvent) free(StaticWriteEvent);
if (fdtab) free(fdtab);
pool_destroy(pool_session);
@@ -628,6 +642,30 @@
kill(oldpids[p], sig);
}
+/*
+ * Central event loop. Refreshes <now> once, then repeatedly runs the
+ * runnable tasks and hands the value returned by process_runnable_tasks()
+ * to the current poller's poll() callback. Returns as soon as there is no
+ * active connection and no listener left.
+ *
+ * FIXME:
+ *   - we still use 'listeners' to check whether we want to stop or not.
+ *
+ */
+void run_poll_loop()
+{
+	tv_now(&now);
+
+	for (;;) {
+		int delay = process_runnable_tasks();
+
+		/* nothing left to serve and nothing more will be accepted */
+		if (actconn == 0 && listeners == 0)
+			return;
+
+		cur_poller.poll(&cur_poller, delay);
+	}
+}
+
+
int main(int argc, char **argv)
{
int err, retry;
@@ -860,41 +898,10 @@
setsid();
}
-#if defined(ENABLE_EPOLL)
- if (cfg_polling_mechanism & POLL_USE_EPOLL) {
- if (epoll_loop(POLL_LOOP_ACTION_INIT)) {
- epoll_loop(POLL_LOOP_ACTION_RUN);
- epoll_loop(POLL_LOOP_ACTION_CLEAN);
- cfg_polling_mechanism &= POLL_USE_EPOLL;
- }
- else {
- Warning("epoll() is not available. Using poll()/select() instead.\n");
- cfg_polling_mechanism &= ~POLL_USE_EPOLL;
- }
- }
-#endif
-
-#if defined(ENABLE_POLL)
- if (cfg_polling_mechanism & POLL_USE_POLL) {
- if (poll_loop(POLL_LOOP_ACTION_INIT)) {
- poll_loop(POLL_LOOP_ACTION_RUN);
- poll_loop(POLL_LOOP_ACTION_CLEAN);
- cfg_polling_mechanism &= POLL_USE_POLL;
- }
- else {
- Warning("poll() is not available. Using select() instead.\n");
- cfg_polling_mechanism &= ~POLL_USE_POLL;
- }
- }
-#endif
- if (cfg_polling_mechanism & POLL_USE_SELECT) {
- if (select_loop(POLL_LOOP_ACTION_INIT)) {
- select_loop(POLL_LOOP_ACTION_RUN);
- select_loop(POLL_LOOP_ACTION_CLEAN);
- cfg_polling_mechanism &= POLL_USE_SELECT;
- }
- }
-
+ /*
+ * That's it : the central polling loop. Run until we stop.
+ */
+ run_poll_loop();
/* Free all Hash Keys and all Hash elements */
appsession_cleanup();