[MAJOR] ev_epoll: do not rely on fd_sets anymore

The new epoll-based poller uses a list of changes in order to
process only the fds which have changed.
diff --git a/src/ev_epoll.c b/src/ev_epoll.c
index e37fe0d..89315ea 100644
--- a/src/ev_epoll.c
+++ b/src/ev_epoll.c
@@ -16,6 +16,7 @@
 
 #include <common/compat.h>
 #include <common/config.h>
+#include <common/standard.h>
 #include <common/time.h>
 
 #include <types/fd.h>
@@ -35,67 +36,185 @@
 #include <sys/epoll.h>
 #endif
 
+/* This is what we store in a list. It consists in old values and fds to detect changes. */
+struct fd_chg {
+	unsigned int prev:2;	// previous state mask. New one is in fd_evts.
+	unsigned int fd:30;	// file descriptor
+};
 
-static fd_set *fd_evts[2];
-static fd_set *old_evts[2];
+static int nbchanges = 0;		// number of changes pending
+static struct fd_chg *chg_list = NULL;	// list of changes
+static struct fd_chg **chg_ptr = NULL;	// per-fd changes
+
+/* Each 32-bit word contains 2-bit descriptors of the latest state for 16 FDs :
+ *   desc = (u32 >> (2*fd)) & 3
+ *   desc = 0 : FD not set
+ *          1 : WRITE not set, READ set
+ *          2 : WRITE set, READ not set
+ *          3 : WRITE set, READ set
+ */
+
+static uint32_t *fd_evts;
 
 /* private data */
 static struct epoll_event *epoll_events;
 static int epoll_fd;
 
+/* This structure may be used for any purpose. Warning! do not use it in
+ * recursive functions !
+ */
+static struct epoll_event ev;
+
+/* converts a direction to a single bitmask.
+ *  0 => 1
+ *  1 => 2
+ */
+#define DIR2MSK(dir) ((dir) + 1)
+
+/* converts an FD to an fd_evts offset and to a bit shift */
+#define FD2OFS(fd)   ((uint32_t)(fd) >> 4)
+#define FD2BIT(fd)   (((uint32_t)(fd) & 0xF) << 1)
+#define FD2MSK(fd)   (3 << FD2BIT(fd))
 
 /*
- * Benchmarks performed on a Pentium-M notebook show that using functions
- * instead of the usual macros improve the FD_* performance by about 80%,
- * and that marking them regparm(2) adds another 20%.
+ * Returns non-zero if direction <dir> is already set for <fd>.
  */
 REGPRM2 static int __fd_is_set(const int fd, int dir)
 {
-	return FD_ISSET(fd, fd_evts[dir]);
+	return (fd_evts[FD2OFS(fd)] >> FD2BIT(fd)) & DIR2MSK(dir);
 }
 
-REGPRM2 static int __fd_set(const int fd, int dir)
+/*
+ * Adds, mods or deletes <fd> according to current status and to new desired
+ * mask <dmask> :
+ *
+ *    0 = nothing
+ *    1 = EPOLLIN
+ *    2 = EPOLLOUT
+ *    3 = EPOLLIN | EPOLLOUT
+ *
+ */
+static int dmsk2event[4] = { 0, EPOLLIN, EPOLLOUT, EPOLLIN | EPOLLOUT };
+
+
+REGPRM2 static void fd_flush_changes()
 {
-	FD_SET(fd, fd_evts[dir]);
-	return 0;
+	uint32_t ofs;
+	int opcode;
+	int prev, next;
+	int chg, fd;
+
+	for (chg = 0; chg < nbchanges; chg++) {
+		prev = chg_list[chg].prev;
+		fd = chg_list[chg].fd;
+		chg_ptr[fd] = NULL;
+
+		ofs = FD2OFS(fd);
+		next = (fd_evts[ofs] >> FD2BIT(fd)) & 3;
+
+		if (prev == next)
+			/* if the value was unchanged, do nothing */
+			continue;
+
+		ev.events = dmsk2event[next];
+		ev.data.fd = fd;
+
+		if (prev) {
+			if (!next) {
+				/* we want to delete it now */
+				opcode = EPOLL_CTL_DEL;
+			} else {
+				/* we want to switch it */
+				opcode = EPOLL_CTL_MOD;
+			}
+		} else {
+			/* the FD did not exist, let's add it */
+			opcode = EPOLL_CTL_ADD;
+		}
+
+		epoll_ctl(epoll_fd, opcode, fd, &ev);
+	}
+	nbchanges = 0;
 }
 
-REGPRM2 static int __fd_clr(const int fd, int dir)
+REGPRM2 static void alloc_chg_list(const int fd, int old_evt)
 {
-	FD_CLR(fd, fd_evts[dir]);
-	return 0;
+	struct fd_chg *ptr;
+
+	if (unlikely(chg_ptr[fd]))
+		return;
+
+#if LIMIT_NUMBER_OF_CHANGES
+	if (nbchanges > 2)
+		fd_flush_changes();
+#endif
+
+	ptr = &chg_list[nbchanges++];
+	chg_ptr[fd] = ptr;
+	ptr->fd = fd;
+	ptr->prev = old_evt;
 }
 
-REGPRM2 static int __fd_cond_s(const int fd, int dir)
+REGPRM2 static int __fd_set(const int fd, int dir)
 {
-	int ret;
-	ret = !FD_ISSET(fd, fd_evts[dir]);
-	if (ret)
-		FD_SET(fd, fd_evts[dir]);
-	return ret;
+	uint32_t ofs = FD2OFS(fd);
+	uint32_t dmsk = DIR2MSK(dir);
+	uint32_t old_evt;
+
+	old_evt = fd_evts[ofs] >> FD2BIT(fd);
+	old_evt &= 3;
+	if (unlikely(old_evt & dmsk))
+		return 0;
+
+	alloc_chg_list(fd, old_evt);
+	dmsk <<= FD2BIT(fd);
+	fd_evts[ofs] |= dmsk;
+	return 1;
 }
 
-REGPRM2 static int __fd_cond_c(const int fd, int dir)
+REGPRM2 static int __fd_clr(const int fd, int dir)
 {
-	int ret;
-	ret = FD_ISSET(fd, fd_evts[dir]);
-	if (ret)
-		FD_CLR(fd, fd_evts[dir]);
-	return ret;
+	uint32_t ofs = FD2OFS(fd);
+	uint32_t dmsk = DIR2MSK(dir);
+	uint32_t old_evt;
+
+	old_evt = fd_evts[ofs] >> FD2BIT(fd);
+	old_evt &= 3;
+	if (unlikely(!(old_evt & dmsk)))
+		return 0;
+
+	alloc_chg_list(fd, old_evt);
+	dmsk <<= FD2BIT(fd);
+	fd_evts[ofs] &= ~dmsk;
+	return 1;
 }
 
-REGPRM1 static void __fd_rem(const int fd)
+REGPRM1 static void __fd_rem(int fd)
 {
-	FD_CLR(fd, fd_evts[DIR_RD]);
-	FD_CLR(fd, fd_evts[DIR_WR]);
+	uint32_t ofs = FD2OFS(fd);
+
+	if (unlikely(!((fd_evts[ofs] >> FD2BIT(fd)) & 3)))
+		return;
+
+	alloc_chg_list(fd, 0);
+	fd_evts[ofs] &= ~FD2MSK(fd);
+	return;
 }
 
-REGPRM1 static void __fd_clo(const int fd)
+/*
+ * On valid epoll() implementations, a call to close() automatically removes
+ * the fds. This means that the FD will appear as previously unset.
+ */
+REGPRM1 static void __fd_clo(int fd)
 {
-	FD_CLR(fd, fd_evts[DIR_RD]);
-	FD_CLR(fd, fd_evts[DIR_WR]);
-	FD_CLR(fd, old_evts[DIR_RD]);
-	FD_CLR(fd, old_evts[DIR_WR]);
+	struct fd_chg *ptr;
+	fd_evts[FD2OFS(fd)] &= ~FD2MSK(fd);
+	ptr = chg_ptr[fd];
+	if (ptr) {
+		ptr->prev = 0;
+		chg_ptr[fd] = NULL;
+	}
+	return;
 }
 
 /*
@@ -105,93 +224,11 @@
 {
 	int status;
 	int fd;
-
-	int fds, count;
-	int pr, pw, sr, sw;
-	unsigned rn, ro, wn, wo; /* read new, read old, write new, write old */
-	struct epoll_event ev;
-
-	for (fds = 0; (fds << INTBITS) < maxfd; fds++) {
-	  
-		rn = ((int*)fd_evts[DIR_RD])[fds];  ro = ((int*)old_evts[DIR_RD])[fds];
-		wn = ((int*)fd_evts[DIR_WR])[fds]; wo = ((int*)old_evts[DIR_WR])[fds];
-	  
-		if ((ro^rn) | (wo^wn)) {
-			for (count = 0, fd = fds << INTBITS; count < (1<<INTBITS) && fd < maxfd; count++, fd++) {
-#define FDSETS_ARE_INT_ALIGNED
-#ifdef FDSETS_ARE_INT_ALIGNED
+	int count;
 
-#define WE_REALLY_NOW_THAT_FDSETS_ARE_INTS
-#ifdef WE_REALLY_NOW_THAT_FDSETS_ARE_INTS
-				pr = (ro >> count) & 1;
-				pw = (wo >> count) & 1;
-				sr = (rn >> count) & 1;
-				sw = (wn >> count) & 1;
-#else
-				pr = FD_ISSET(fd&((1<<INTBITS)-1), (typeof(fd_set*))&ro);
-				pw = FD_ISSET(fd&((1<<INTBITS)-1), (typeof(fd_set*))&wo);
-				sr = FD_ISSET(fd&((1<<INTBITS)-1), (typeof(fd_set*))&rn);
-				sw = FD_ISSET(fd&((1<<INTBITS)-1), (typeof(fd_set*))&wn);
-#endif
-#else
-				pr = FD_ISSET(fd, old_evts[DIR_RD]);
-				pw = FD_ISSET(fd, old_evts[DIR_WR]);
-				sr = FD_ISSET(fd, fd_evts[DIR_RD]);
-				sw = FD_ISSET(fd, fd_evts[DIR_WR]);
-#endif
-				if (!((sr^pr) | (sw^pw)))
-					continue;
+	if (likely(nbchanges))
+		fd_flush_changes();
 
-				ev.events = (sr ? EPOLLIN : 0) | (sw ? EPOLLOUT : 0);
-				ev.data.fd = fd;
-
-#ifdef EPOLL_CTL_MOD_WORKAROUND
-				/* I encountered a rarely reproducible problem with
-				 * EPOLL_CTL_MOD where a modified FD (systematically
-				 * the one in epoll_events[0], fd#7) would sometimes
-				 * be set EPOLL_OUT while asked for a read ! This is
-				 * with the 2.4 epoll patch. The workaround is to
-				 * delete then recreate in case of modification.
-				 * This is in 2.4 up to epoll-lt-0.21 but not in 2.6
-				 * nor RHEL kernels.
-				 */
-
-				if ((pr | pw) && fdtab[fd].state != FD_STCLOSE)
-					epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, &ev);
-
-				if ((sr | sw))
-					epoll_ctl(epoll_fd, EPOLL_CTL_ADD, fd, &ev);
-#else
-				if ((pr | pw)) {
-					/* the file-descriptor already exists... */
-					if ((sr | sw)) {
-						/* ...and it will still exist */
-						if (epoll_ctl(epoll_fd, EPOLL_CTL_MOD, fd, &ev) < 0) {
-							// perror("epoll_ctl(MOD)");
-							// exit(1);
-						}
-					} else {
-						/* ...and it will be removed */
-						if (fdtab[fd].state != FD_STCLOSE &&
-						    epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, &ev) < 0) {
-							// perror("epoll_ctl(DEL)");
-							// exit(1);
-						}
-					}
-				} else {
-					/* the file-descriptor did not exist, let's add it */
-					if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, fd, &ev) < 0) {
-						// perror("epoll_ctl(ADD)");
-						//  exit(1);
-					}
-				}
-#endif // EPOLL_CTL_MOD_WORKAROUND
-			}
-			((int*)old_evts[DIR_RD])[fds] = rn;
-			((int*)old_evts[DIR_WR])[fds] = wn;
-		}		  
-	}
-      
 	/* now let's wait for events */
 	status = epoll_wait(epoll_fd, epoll_events, maxfd, wait_time);
 	tv_now(&now);
@@ -199,14 +236,14 @@
 	for (count = 0; count < status; count++) {
 		fd = epoll_events[count].data.fd;
 
-		if (FD_ISSET(fd, fd_evts[DIR_RD])) {
+		if ((fd_evts[FD2OFS(fd)] >> FD2BIT(fd)) & DIR2MSK(DIR_RD)) {
 			if (fdtab[fd].state == FD_STCLOSE)
 				continue;
 			if (epoll_events[count].events & ( EPOLLIN | EPOLLERR | EPOLLHUP ))
 				fdtab[fd].cb[DIR_RD].f(fd);
 		}
 
-		if (FD_ISSET(fd, fd_evts[DIR_WR])) {
+		if ((fd_evts[FD2OFS(fd)] >> FD2BIT(fd)) & DIR2MSK(DIR_WR)) {
 			if (fdtab[fd].state == FD_STCLOSE)
 				continue;
 			if (epoll_events[count].events & ( EPOLLOUT | EPOLLERR | EPOLLHUP ))
@@ -222,11 +259,11 @@
  */
 REGPRM1 static int epoll_init(struct poller *p)
 {
-	__label__ fail_pwevt, fail_prevt, fail_swevt, fail_srevt, fail_ee, fail_fd;
+	__label__ fail_chg_ptr, fail_chg_list, fail_fdevt, fail_ee, fail_fd;
 	int fd_set_bytes;
 
 	p->private = NULL;
-	fd_set_bytes = sizeof(fd_set) * (global.maxsock + FD_SETSIZE - 1) / FD_SETSIZE;
+	fd_set_bytes = 4 * (global.maxsock + 15) / 16;
 
 	epoll_fd = epoll_create(global.maxsock + 1);
 	if (epoll_fd < 0)
@@ -238,27 +275,24 @@
 	if (epoll_events == NULL)
 		goto fail_ee;
 
-	if ((old_evts[DIR_RD] = (fd_set *)calloc(1, fd_set_bytes)) == NULL)
-		goto fail_prevt;
+	if ((fd_evts = (uint32_t *)calloc(1, fd_set_bytes)) == NULL)
+		goto fail_fdevt;
 
-	if ((old_evts[DIR_WR] = (fd_set *)calloc(1, fd_set_bytes)) == NULL)
-		goto fail_pwevt;
-		
-	if ((fd_evts[DIR_RD] = (fd_set *)calloc(1, fd_set_bytes)) == NULL)
-		goto fail_srevt;
+	chg_list = (struct fd_chg *)calloc(1, sizeof(struct fd_chg) * global.maxsock);
+	if (chg_list == NULL)
+		goto fail_chg_list;
 
-	if ((fd_evts[DIR_WR] = (fd_set *)calloc(1, fd_set_bytes)) == NULL)
-		goto fail_swevt;
+	chg_ptr = (struct fd_chg **)calloc(1, sizeof(struct fd_chg*) * global.maxsock);
+	if (chg_ptr == NULL)
+		goto fail_chg_ptr;
 
 	return 1;
 
- fail_swevt:
-	free(fd_evts[DIR_RD]);
- fail_srevt:
-	free(old_evts[DIR_WR]);
- fail_pwevt:
-	free(old_evts[DIR_RD]);
- fail_prevt:
+ fail_chg_ptr:
+	free(chg_list);
+ fail_chg_list:
+	free(fd_evts);
+ fail_fdevt:
 	free(epoll_events);
  fail_ee:
 	close(epoll_fd);
@@ -274,24 +308,25 @@
  */
 REGPRM1 static void epoll_term(struct poller *p)
 {
-	if (fd_evts[DIR_WR])
-		free(fd_evts[DIR_WR]);
+	fd_flush_changes();
 
-	if (fd_evts[DIR_RD])
-		free(fd_evts[DIR_RD]);
-
-	if (old_evts[DIR_WR])
-		free(old_evts[DIR_WR]);
-
-	if (old_evts[DIR_RD])
-		free(old_evts[DIR_RD]);
-
+	if (chg_ptr)
+		free(chg_ptr);
+	if (chg_list)
+		free(chg_list);
+	if (fd_evts)
+		free(fd_evts);
 	if (epoll_events)
 		free(epoll_events);
 
 	close(epoll_fd);
 	epoll_fd = 0;
 
+	chg_ptr = NULL;
+	chg_list = NULL;
+	fd_evts = NULL;
+	epoll_events = NULL;
+
 	p->private = NULL;
 	p->pref = 0;
 }
@@ -324,13 +359,13 @@
 	p->init = epoll_init;
 	p->term = epoll_term;
 	p->poll = epoll_poll;
-	p->is_set = __fd_is_set;
-	p->set = __fd_set;
-	p->clr = __fd_clr;
+
+	p->is_set  = __fd_is_set;
+	p->cond_s = p->set = __fd_set;
+	p->cond_c = p->clr = __fd_clr;
 	p->rem = __fd_rem;
 	p->clo = __fd_clo;
-	p->cond_s = __fd_cond_s;
-	p->cond_c = __fd_cond_c;
+	
 	return 1;
 }