* released 1.2.5-pre3
* experimental epoll() support (use temporary '-e' argument)
diff --git a/CHANGELOG b/CHANGELOG
index f0af36f..fc0b5ab 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,7 +1,10 @@
ChangeLog :
===========
-2005/04/24
+2005/04/25 : 1.2.5-pre3
+ - experimental epoll() support (use temporary '-e' argument)
+
+2005/04/24 : 1.2.5-pre2
- implemented the HTTP 303 code for error redirection. This forces the
browser to fetch the given URI with a GET request. The new keyword for
this is 'errorloc303', and a new 'errorloc302' keyword has been created
@@ -9,7 +12,7 @@
- added more controls in the parser for valid use of '\x' sequence.
- few fixes from Alex & Klaus
-2005/02/17
+2005/02/17 : 1.2.5-pre1
- fixed a few errors in the documentation
2005/02/13
diff --git a/Makefile b/Makefile
index 29a79fc..70a21fe 100644
--- a/Makefile
+++ b/Makefile
@@ -73,8 +73,8 @@
REGEX_OPTS=$(COPTS.$(REGEX))
CPU_OPTS=$(COPTS.$(CPU))
-COPTS=-I. $(CPU_OPTS) $(TARGET_OPTS) $(REGEX_OPTS) $(SMALL_OPTS)
-LIBS=$(LIBS.$(TARGET)) $(LIBS.$(REGEX))
+COPTS=-I. $(ADDINC) $(CPU_OPTS) $(TARGET_OPTS) $(REGEX_OPTS) $(SMALL_OPTS)
+LIBS=$(LIBS.$(TARGET)) $(LIBS.$(REGEX)) $(ADDLIB)
# - use -DSTATTIME=0 to disable statistics, else specify an interval in
# milliseconds.
diff --git a/TODO b/TODO
index 047468b..66a4ec6 100644
--- a/TODO
+++ b/TODO
@@ -139,4 +139,5 @@
* appcookie
- weighted round robin
- option to shutdown(listen_sock) when max connections reached
-
+* epoll
+- replace the event scheduler with an O(log(N)) one
diff --git a/haproxy.c b/haproxy.c
index c5c2917..79819e6 100644
--- a/haproxy.c
+++ b/haproxy.c
@@ -62,6 +62,8 @@
#include <strings.h>
#endif
+#include <sys/epoll.h>
+
#include "include/appsession.h"
#define HAPROXY_VERSION "1.2.5"
@@ -575,6 +577,7 @@
/*********************************************************************/
int cfg_maxpconn = 2000; /* # of simultaneous connections per proxy (-N) */
+int cfg_use_epoll = 0; /* use epoll() instead of select() ? */
char *cfg_cfgfile = NULL; /* configuration file */
char *progname = NULL; /* program name */
int pid; /* current process id */
@@ -607,6 +610,11 @@
*StaticReadEvent,
*StaticWriteEvent;
+/* used by the epoll() emulation of select() */
+fd_set *PrevReadEvent, *PrevWriteEvent;
+struct epoll_event *epoll_events;
+int epoll_fd;
+
void **pool_session = NULL,
**pool_buffer = NULL,
**pool_fdtab = NULL,
@@ -803,7 +811,9 @@
" -c check mode : only check config file and exit\n"
" -n sets the maximum total # of connections (%d)\n"
" -N sets the default, per-proxy maximum # of connections (%d)\n"
- " -p writes pids of all children to this file\n\n",
+ " -p writes pids of all children to this file\n"
+ " -e tries to use epoll() instead of select()\n"
+ "\n",
name, DEFAULT_MAXCONN, cfg_maxpconn);
exit(1);
}
@@ -1384,6 +1394,20 @@
static inline void fd_delete(int fd) {
FD_CLR(fd, StaticReadEvent);
FD_CLR(fd, StaticWriteEvent);
+ if (cfg_use_epoll) {
+ struct epoll_event ev;
+
+ ev.data.fd = fd;
+ if (epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, &ev) < 0) {
+ // it's impossible to tell whether it has already
+ // been done.
+ //perror("epoll_ctl(DEL)");
+ //exit(1);
+ }
+
+ FD_CLR(fd, PrevReadEvent);
+ FD_CLR(fd, PrevWriteEvent);
+ }
close(fd);
fdtab[fd].state = FD_STCLOSE;
@@ -2130,6 +2154,20 @@
void client_retnclose(struct session *s, int len, const char *msg) {
FD_CLR(s->cli_fd, StaticReadEvent);
FD_SET(s->cli_fd, StaticWriteEvent);
+ if (cfg_use_epoll) {
+ struct epoll_event ev;
+
+ ev.data.fd = s->cli_fd;
+ if (epoll_ctl(epoll_fd, EPOLL_CTL_DEL, s->cli_fd, &ev) < 0) {
+ // it's impossible to tell whether it has already
+ // been done.
+ //perror("epoll_ctl(DEL)");
+ //exit(1);
+ }
+
+ FD_CLR(s->cli_fd, PrevReadEvent);
+ FD_CLR(s->cli_fd, PrevWriteEvent);
+ }
tv_eternity(&s->crexpire);
shutdown(s->cli_fd, SHUT_RD);
s->cli_state = CL_STSHUTR;
@@ -4832,71 +4870,232 @@
next_time = MINTIME(time2, next_time);
#endif
- if (next_time > 0) { /* FIXME */
- /* Convert to timeval */
- /* to avoid eventual select loops due to timer precision */
- next_time += SCHEDULER_RESOLUTION;
- delta.tv_sec = next_time / 1000;
- delta.tv_usec = (next_time % 1000) * 1000;
- }
- else if (next_time == 0) { /* allow select to return immediately when needed */
- delta.tv_sec = delta.tv_usec = 0;
- }
+ if (cfg_use_epoll) {
+ /* use epoll() */
+ int fds, count;
+ int pr, pw, sr, sw;
+ unsigned rn, ro, wn, wo; /* read new, read old, write new, write old */
+ struct epoll_event ev;
- /* let's restore fdset state */
+ for (fds = 0; (fds << INTBITS) < maxfd; fds++) {
- readnotnull = 0; writenotnull = 0;
- for (i = 0; i < (maxfd + FD_SETSIZE - 1)/(8*sizeof(int)); i++) {
- readnotnull |= (*(((int*)ReadEvent)+i) = *(((int*)StaticReadEvent)+i)) != 0;
- writenotnull |= (*(((int*)WriteEvent)+i) = *(((int*)StaticWriteEvent)+i)) != 0;
- }
-
-// /* just a verification code, needs to be removed for performance */
-// for (i=0; i<maxfd; i++) {
-// if (FD_ISSET(i, ReadEvent) != FD_ISSET(i, StaticReadEvent))
-// abort();
-// if (FD_ISSET(i, WriteEvent) != FD_ISSET(i, StaticWriteEvent))
-// abort();
-//
-// }
+ rn = ((int*)StaticReadEvent)[fds]; ro = ((int*)PrevReadEvent)[fds];
+ wn = ((int*)StaticWriteEvent)[fds]; wo = ((int*)PrevWriteEvent)[fds];
- status = select(maxfd,
- readnotnull ? ReadEvent : NULL,
- writenotnull ? WriteEvent : NULL,
- NULL,
- (next_time >= 0) ? &delta : NULL);
-
- /* this is an experiment on the separation of the select work */
- // status = (readnotnull ? select(maxfd, ReadEvent, NULL, NULL, (next_time >= 0) ? &delta : NULL) : 0);
- // status |= (writenotnull ? select(maxfd, NULL, WriteEvent, NULL, (next_time >= 0) ? &delta : NULL) : 0);
-
- tv_now(&now);
+ if ((ro^rn) | (wo^wn)) {
+ for (count = 1<<INTBITS, fd = fds << INTBITS; count && fd < maxfd; count--, fd++) {
- if (status > 0) { /* must proceed with events */
+#define WE_KNOW_HOW_FDSET_WORKS
+#ifdef WE_KNOW_HOW_FDSET_WORKS
- int fds;
- char count;
-
- for (fds = 0; (fds << INTBITS) < maxfd; fds++)
- if ((((int *)(ReadEvent))[fds] | ((int *)(WriteEvent))[fds]) != 0)
- for (count = 1<<INTBITS, fd = fds << INTBITS; count && fd < maxfd; count--, fd++) {
-
- /* if we specify read first, the accepts and zero reads will be
- * seen first. Moreover, system buffers will be flushed faster.
- */
- if (fdtab[fd].state == FD_STCLOSE)
+#define WE_REALLY_NOW_THAT_FDSETS_ARE_INTS
+#ifdef WE_REALLY_NOW_THAT_FDSETS_ARE_INTS
+ pr = (ro >> ((1<<INTBITS)-count)) & 1;
+ pw = (wo >> ((1<<INTBITS)-count)) & 1;
+ sr = (rn >> ((1<<INTBITS)-count)) & 1;
+ sw = (wn >> ((1<<INTBITS)-count)) & 1;
+#else
+ pr = FD_ISSET(fd&((1<<INTBITS)-1), (typeof(fd_set*))&ro);
+ pw = FD_ISSET(fd&((1<<INTBITS)-1), (typeof(fd_set*))&wo);
+ sr = FD_ISSET(fd&((1<<INTBITS)-1), (typeof(fd_set*))&rn);
+ sw = FD_ISSET(fd&((1<<INTBITS)-1), (typeof(fd_set*))&wn);
+#endif
+#else
+ pr = FD_ISSET(fd, PrevReadEvent);
+ pw = FD_ISSET(fd, PrevWriteEvent);
+ sr = FD_ISSET(fd, StaticReadEvent);
+ sw = FD_ISSET(fd, StaticWriteEvent);
+#endif
+ if (!((sr^pr) | (sw^pw)))
continue;
-
- if (FD_ISSET(fd, ReadEvent))
- fdtab[fd].read(fd);
+
+ ev.events = (sr ? EPOLLIN : 0) | (sw ? EPOLLOUT : 0);
+ ev.data.fd = fd;
+
+ if ((pr | pw)) {
+ /* the file-descriptor already exists... */
+ if ((sr | sw)) {
+ /* ...and it will still exist */
+ if (epoll_ctl(epoll_fd, EPOLL_CTL_MOD, fd, &ev) < 0) {
+ perror("epoll_ctl(MOD)");
+ exit(1);
+ }
+ } else {
+ /* ...and it will be removed */
+ if (epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, &ev) < 0) {
+ perror("epoll_ctl(DEL)");
+ exit(1);
+ }
+ }
+ } else {
+ /* the file-descriptor did not exist, let's add it */
+ if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, fd, &ev) < 0) {
+ perror("epoll_ctl(ADD)");
+ exit(1);
+ }
+ }
+ }
+ ((int*)PrevReadEvent)[fds] = rn;
+ ((int*)PrevWriteEvent)[fds] = wn;
+ }
+
+#if useless_optimization
+ unsigned a, d, m; /* add mask, del mask, mod mask */
+
+ a = (rn|wn) & ~(ro|wo); /* fds to add */
+ d = (ro|wo) & ~(rn|wn); /* fds to remove, normally none */
+ m = (ro^rn) | (wo^wn); /* fds which change */
+
+ if (m) {
+ struct epoll_event ev;
+ m &= ~(a|d); /* keep only changes, not add/del */
- if (FD_ISSET(fd, WriteEvent))
- fdtab[fd].write(fd);
+ if (m) { /* fds which only change */
+ for (count = 1<<INTBITS, fd = fds << INTBITS; count && fd < maxfd; count--, fd++) {
+ ev.data.fd = fd;
+ ev.events = 0;
+
+ if ((FD_ISSET(fd, PrevReadEvent) || FD_ISSET(fd, PrevWriteEvent)) &&
+ (FD_ISSET(fd, StaticReadEvent) || FD_ISSET(fd, StaticWriteEvent))) {
+ if (FD_ISSET(fd, StaticReadEvent))
+ ev.events |= EPOLLIN;
+ if (FD_ISSET(fd, StaticWriteEvent))
+ ev.events |= EPOLLOUT;
+ if (ev.events && epoll_ctl(epoll_fd, EPOLL_CTL_MOD, fd, &ev) < 0) {
+ perror("epoll_ctl(MOD)");
+ exit(1);
+ }
+ }
+ }
}
- }
- else {
- // fprintf(stderr,"select returned %d, maxfd=%d\n", status, maxfd);
+
+ if (a) { /* fds to add */
+ // printf("a=%08x\n", a);
+ for (count = 1<<INTBITS, fd = fds << INTBITS; count && fd < maxfd; count--, fd++) {
+ ev.data.fd = fd;
+ ev.events = 0;
+ if (!FD_ISSET(fd, PrevReadEvent) && !FD_ISSET(fd, PrevWriteEvent)) {
+ if (FD_ISSET(fd, StaticReadEvent))
+ ev.events |= EPOLLIN;
+ if (FD_ISSET(fd, StaticWriteEvent))
+ ev.events |= EPOLLOUT;
+ if (ev.events && epoll_ctl(epoll_fd, EPOLL_CTL_ADD, fd, &ev) < 0) {
+ perror("epoll_ctl(ADD)");
+ exit(1);
+ }
+ }
+ }
+ }
+
+ if (d) { /* fds to delete */
+ for (count = 1<<INTBITS, fd = fds << INTBITS; count && fd < maxfd; count--, fd++) {
+ ev.data.fd = fd;
+ ev.events = 0;
+ if (FD_ISSET(fd, StaticReadEvent) || FD_ISSET(fd, StaticWriteEvent))
+ continue;
+ if (!FD_ISSET(fd, PrevReadEvent) && !FD_ISSET(fd, PrevWriteEvent))
+ continue;
+ if (epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, &ev) < 0) {
+ perror("epoll_ctl(DEL)");
+ exit(1);
+ }
+ }
+ }
+ ((int*)PrevReadEvent)[fds] = rn;
+ ((int*)PrevWriteEvent)[fds] = wn;
+ }
+#endif
+ }
+
+ /* now let's wait for events */
+ status = epoll_wait(epoll_fd, epoll_events, maxfd, next_time);
+ tv_now(&now);
+
+ for (count = 0; count < status; count++) {
+ fd = epoll_events[count].data.fd;
+
+ if (fdtab[fd].state == FD_STCLOSE)
+ continue;
+
+ if (epoll_events[count].events & ( EPOLLIN | EPOLLERR | EPOLLHUP ))
+ fdtab[fd].read(fd);
+
+ if (fdtab[fd].state == FD_STCLOSE)
+ continue;
+
+ if (epoll_events[count].events & ( EPOLLOUT | EPOLLERR | EPOLLHUP ))
+ fdtab[fd].write(fd);
+ }
+ } else {
+ /* use select() */
+
+ if (next_time > 0) { /* FIXME */
+ /* Convert to timeval */
+ /* to avoid eventual select loops due to timer precision */
+ next_time += SCHEDULER_RESOLUTION;
+ delta.tv_sec = next_time / 1000;
+ delta.tv_usec = (next_time % 1000) * 1000;
+ }
+ else if (next_time == 0) { /* allow select to return immediately when needed */
+ delta.tv_sec = delta.tv_usec = 0;
+ }
+
+
+ /* let's restore fdset state */
+
+ readnotnull = 0; writenotnull = 0;
+ for (i = 0; i < (maxfd + FD_SETSIZE - 1)/(8*sizeof(int)); i++) {
+ readnotnull |= (*(((int*)ReadEvent)+i) = *(((int*)StaticReadEvent)+i)) != 0;
+ writenotnull |= (*(((int*)WriteEvent)+i) = *(((int*)StaticWriteEvent)+i)) != 0;
+ }
+
+ // /* just a verification code, needs to be removed for performance */
+ // for (i=0; i<maxfd; i++) {
+ // if (FD_ISSET(i, ReadEvent) != FD_ISSET(i, StaticReadEvent))
+ // abort();
+ // if (FD_ISSET(i, WriteEvent) != FD_ISSET(i, StaticWriteEvent))
+ // abort();
+ //
+ // }
+
+ status = select(maxfd,
+ readnotnull ? ReadEvent : NULL,
+ writenotnull ? WriteEvent : NULL,
+ NULL,
+ (next_time >= 0) ? &delta : NULL);
+
+ /* this is an experiment on the separation of the select work */
+ // status = (readnotnull ? select(maxfd, ReadEvent, NULL, NULL, (next_time >= 0) ? &delta : NULL) : 0);
+ // status |= (writenotnull ? select(maxfd, NULL, WriteEvent, NULL, (next_time >= 0) ? &delta : NULL) : 0);
+
+ tv_now(&now);
+
+ if (status > 0) { /* must proceed with events */
+
+ int fds;
+ char count;
+
+ for (fds = 0; (fds << INTBITS) < maxfd; fds++)
+ if ((((int *)(ReadEvent))[fds] | ((int *)(WriteEvent))[fds]) != 0)
+ for (count = 1<<INTBITS, fd = fds << INTBITS; count && fd < maxfd; count--, fd++) {
+
+ /* if we specify read first, the accepts and zero reads will be
+ * seen first. Moreover, system buffers will be flushed faster.
+ */
+ if (fdtab[fd].state == FD_STCLOSE)
+ continue;
+
+ if (FD_ISSET(fd, ReadEvent))
+ fdtab[fd].read(fd);
+
+ if (FD_ISSET(fd, WriteEvent))
+ fdtab[fd].write(fd);
+ }
+ }
+ else {
+ // fprintf(stderr,"select returned %d, maxfd=%d\n", status, maxfd);
+ }
}
}
}
@@ -6795,6 +6994,8 @@
display_version();
exit(0);
}
+ else if (*flag == 'e')
+ cfg_use_epoll = 1;
else if (*flag == 'V')
arg_mode |= MODE_VERBOSE;
else if (*flag == 'd')
@@ -6897,6 +7098,22 @@
sizeof(fd_set) *
(global.maxsock + FD_SETSIZE - 1) / FD_SETSIZE);
+ if (cfg_use_epoll) {
+ epoll_fd = epoll_create(global.maxsock + 1);
+ if (epoll_fd < 0) {
+ Warning("epoll() is not available. Using select() instead.\n");
+ cfg_use_epoll = 0;
+ } else {
+ epoll_events = (struct epoll_event*) calloc(1, sizeof(struct epoll_event) * global.maxsock);
+ PrevReadEvent = (fd_set *)calloc(1,
+ sizeof(fd_set) *
+ (global.maxsock + FD_SETSIZE - 1) / FD_SETSIZE);
+ PrevWriteEvent = (fd_set *)calloc(1,
+ sizeof(fd_set) *
+ (global.maxsock + FD_SETSIZE - 1) / FD_SETSIZE);
+ }
+ }
+
fdtab = (struct fdtab *)calloc(1,
sizeof(struct fdtab) * (global.maxsock));
for (i = 0; i < global.maxsock; i++) {
@@ -7109,6 +7326,8 @@
if (ReadEvent) free(ReadEvent);
if (WriteEvent) free(WriteEvent);
+ if (PrevReadEvent) free(PrevReadEvent);
+ if (PrevWriteEvent) free(PrevWriteEvent);
if (StaticReadEvent) free(StaticReadEvent);
if (StaticWriteEvent) free(StaticWriteEvent);
if (fdtab) free(fdtab);