OPTIM/MAJOR: ev_sepoll: process spec events after polled events

A suboptimal behaviour appeared quite often with sepoll. When a
speculative write failed after a connect(), the socket was added to
the poll list using epoll_ctl(ADD). Then when epoll_wait() returned a
write event, the send() was performed and the write event disabled,
sending the FD back to the spec list so that polling could be disabled
later. But if a new accept() succeeded during the same run, fd_created
was non-zero, causing a new pass over the spec list. This pass would
then detect the old event in STOP state and remove it from the poll
list using epoll_ctl(DEL).

After this, process_session() enables reading on the FD, attempting a
speculative recv() which fails, then adds the FD back using
epoll_ctl(ADD) so that the recv() can be retried after polling. So the
total sequence of syscalls looked like this:

connect(fd) = EAGAIN
send(fd) = EAGAIN
epoll_ctl(ADD(fd:OUT))
epoll_wait() = fd:OUT
send(fd) > 0
epoll_ctl(DEL(fd))
recv(fd) = EAGAIN
epoll_ctl(ADD(fd:IN))
recv(fd) > 0

In order to fix this stupid situation, we must compute the epoll_ctl()
parameters at the last moment, just before calling epoll_wait(). This
is what was already being done, except that the spec events were
processed right before that, without leaving the tasks any time to
adjust the FDs if needed. This is also why the re_poll_once label
existed: to try to catch new events after a successful accept().

The new solution consists in doing the opposite, as sketched below:

  - compute epoll_ctl()
  - call epoll_wait()
  - call spec events
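
In rough outline, the new _do_poll() becomes the following sketch. The
helper names here are purely illustrative assumptions; the real code
inlines all three steps into _do_poll(), as the patch below shows:

  /* hypothetical outline of the new _do_poll() ordering; the helper
   * names are illustrative only, the real code inlines these steps.
   */
  static void update_poll_list(void)
  {
      /* walk the spec list, compare each entry's new state with its
       * previous one, and issue a single epoll_ctl(ADD/MOD/DEL) only
       * where the polled bits actually changed.
       */
  }

  static void wait_for_polled_events(void)
  {
      /* epoll_wait(), then run the I/O callbacks for the returned
       * events; an accept() here may add new FDs to the spec list.
       */
  }

  static void process_spec_events(void)
  {
      /* run the speculative callbacks; demote an event to WAIT when
       * its callback cannot make progress, so it gets polled instead.
       */
  }

  static void do_poll_sketch(void)
  {
      update_poll_list();        /* 1. commit state changes to epoll */
      wait_for_polled_events();  /* 2. polled events, maybe accept()  */
      process_spec_events();     /* 3. new FDs are handled right away */
  }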

This significantly reduces the number of iterations over the spec
events and avoids a huge number of epoll_ctl() ping-pongs. The
sequence above simply becomes:

connect(fd) = EAGAIN
send(fd) = EAGAIN
epoll_ctl(ADD(fd:OUT))
epoll_wait() = fd:OUT
send(fd) > 0
epoll_ctl(MOD(fd:IN))
recv(fd) > 0

Also, there is no need to re-run the spec events after an accept():
the new FD is automatically detected in the spec list upon return from
the polled events, as the sketch below illustrates.
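
For illustration, here is a hedged sketch of that accept path;
fd_want_recv_spec() is a hypothetical stand-in for the poller's
__fd_set() on the read direction:

  /* hypothetical stub: would switch the FD to SPEC state for reads
   * and allocate its spec-list entry, as __fd_set() does.
   */
  static void fd_want_recv_spec(int fd)
  {
      (void)fd;
  }

  static void on_accept_sketch(int new_fd)
  {
      fd_want_recv_spec(new_fd);
      /* nothing else to do: the spec-list walk which follows the
       * polled events will attempt the speculative recv() on new_fd
       * within the same loop iteration.
       */
  }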

The gains are important, with up to a 4.5% global performance increase
in connection rate on HTTP with small objects. The code is less tricky
and no longer needs to skip epoll_wait() every other call, nor to
track the number of newly created FDs.
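
As a side note, the patch below tracks state changes by keeping the
current event state in the low 4 bits of fdtab[fd].spec.e and the last
state committed to epoll in the high 4 bits. A minimal standalone
illustration of that bookkeeping (simplified: the real test only
compares the polled bits using FD_EV_RW_PL):

  #include <stdio.h>

  int main(void)
  {
      unsigned int e  = 0x21;   /* example: previous state 2, new state 1 */
      unsigned int en = e & 15; /* new events, low 4 bits */
      unsigned int eo = e >> 4; /* previous events, high 4 bits */

      if (eo != en)
          printf("state changed (%x -> %x), epoll_ctl() needed\n", eo, en);

      e = (en << 4) + en;       /* commit: new state becomes the previous one */
      printf("spec.e after commit: 0x%02x\n", e);
      return 0;
  }
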
diff --git a/src/ev_sepoll.c b/src/ev_sepoll.c
index a7fb64c..c8444be 100644
--- a/src/ev_sepoll.c
+++ b/src/ev_sepoll.c
@@ -105,10 +105,13 @@
  * The FD array has to hold a back reference to the speculative list. This
  * reference is only valid if at least one of the directions is marked SPEC.
  *
+ * We store the FD state in the 4 lower bits of fdtab[fd].spec.e, and save the
+ * previous state upon changes in the 4 higher bits, so that changes are easy
+ * to spot.
  */
 
-#define FD_EV_IN_SL	1
-#define FD_EV_IN_PL	4
+#define FD_EV_IN_SL	1U
+#define FD_EV_IN_PL	4U
 
 #define FD_EV_IDLE	0
 #define FD_EV_SPEC	(FD_EV_IN_SL)
@@ -133,6 +136,7 @@
 #define FD_EV_MASK_W	(FD_EV_MASK_R << 1)
 
 #define FD_EV_MASK	(FD_EV_MASK_W | FD_EV_MASK_R)
+#define FD_EV_MASK_OLD	(FD_EV_MASK << 4)
 
 /* This is the minimum number of events successfully processed in speculative
  * mode above which we agree to return without checking epoll() (1/2 times).
@@ -141,7 +145,6 @@
 
 static int nbspec = 0;          // current size of the spec list
 static int absmaxevents = 0;    // absolute maximum amounts of polled events
-static int fd_created = 0;      // fd creation detector, reset upon poll() entry.
 
 static unsigned int *spec_list = NULL;	// speculative I/O list
 
@@ -228,7 +231,6 @@
 		if (unlikely(i != FD_EV_IDLE))
 			return 0;
 		// switch to SPEC state and allocate a SPEC entry.
-		fd_created++;
 		alloc_spec_entry(fd);
 	}
 	fdtab[fd].spec.e ^= (unsigned int)(FD_EV_IN_SL << dir);
@@ -275,7 +277,7 @@
 REGPRM1 static void __fd_clo(int fd)
 {
 	release_spec_entry(fd);
-	fdtab[fd].spec.e &= ~(FD_EV_MASK);
+	fdtab[fd].spec.e &= ~(FD_EV_MASK | FD_EV_MASK_OLD);
 }
 
 /*
@@ -283,101 +285,51 @@
  */
 REGPRM2 static void _do_poll(struct poller *p, int exp)
 {
-	static unsigned int last_skipped;
-	static unsigned int spec_processed;
-	int status, eo;
+	int status, eo, en;
 	int fd, opcode;
 	int count;
 	int spec_idx;
 	int wait_time;
-	int looping = 0;
 
-
- re_poll_once:
-	/* Here we have two options :
-	 * - either walk the list forwards and hope to match more events
-	 * - or walk it backwards to minimize the number of changes and
-	 *   to make better use of the cache.
-	 * Tests have shown that walking backwards improves perf by 0.2%.
-	 */
-
-	status = 0;
+	/* first, update the poll list according to what changed in the spec list */
 	spec_idx = nbspec;
 	while (likely(spec_idx > 0)) {
-		int done;
-
 		spec_idx--;
 		fd = spec_list[spec_idx];
-		eo = fdtab[fd].spec.e;  /* save old events */
+		en = fdtab[fd].spec.e & 15;  /* new events */
+		eo = fdtab[fd].spec.e >> 4;  /* previous events */
 
-		if (looping && --fd_created < 0) {
-			/* we were just checking the newly created FDs */
-			break;
-		}
-		/*
-		 * Process the speculative events.
-		 *
-		 * Principle: events which are marked FD_EV_SPEC are processed
-		 * with their assigned function. If the function returns 0, it
-		 * means there is nothing doable without polling first. We will
-		 * then convert the event to a pollable one by assigning them
-		 * the WAIT status.
+		/* If an fd with a poll bit is present here, it means that it
+		 * has last requested a poll, or is leaving from a poll. Given
+		 * that an FD is fully in the poll list or in the spec list but
+		 * not in both at once, we'll first ensure that if it is
+		 * already being polled in one direction and requests a spec
+		 * access, then we'll turn that into a polled access in order
+		 * to save a syscall which will likely return EAGAIN.
 		 */
 
-#ifdef DEBUG_DEV
-		if (fdtab[fd].state == FD_STCLOSE) {
-			fprintf(stderr,"fd=%d, fdtab[].ev=%x, fdtab[].spec.e=%x, .s=%d, idx=%d\n",
-				fd, fdtab[fd].ev, fdtab[fd].spec.e, fdtab[fd].spec.s1, spec_idx);
-		}
-#endif
-		done = 0;
-		fdtab[fd].ev &= FD_POLL_STICKY;
-		if ((eo & FD_EV_MASK_R) == FD_EV_SPEC_R) {
-			/* The owner is interested in reading from this FD */
-			if (fdtab[fd].state != FD_STERROR) {
-				/* Pretend there is something to read */
-				fdtab[fd].ev |= FD_POLL_IN;
-				if (!fdtab[fd].cb[DIR_RD].f(fd))
-					fdtab[fd].spec.e ^= (FD_EV_WAIT_R ^ FD_EV_SPEC_R);
-				else
-					done = 1;
-			}
+		if ((en & FD_EV_RW_PL) && (en & FD_EV_RW_SL)) {
+			/* convert SPEC to WAIT if fd already being polled */
+			if ((en & FD_EV_MASK_R) == FD_EV_SPEC_R)
+				en = (en & ~FD_EV_MASK_R) | FD_EV_WAIT_R;
+
+			if ((en & FD_EV_MASK_W) == FD_EV_SPEC_W)
+				en = (en & ~FD_EV_MASK_W) | FD_EV_WAIT_W;
 		}
-		else if ((eo & FD_EV_MASK_R) == FD_EV_STOP_R) {
+
+		if ((en & FD_EV_MASK_R) == FD_EV_STOP_R) {
 			/* This FD was being polled and is now being removed. */
-			fdtab[fd].spec.e &= ~FD_EV_MASK_R;
-		}
-		
-		if ((eo & FD_EV_MASK_W) == FD_EV_SPEC_W) {
-			/* The owner is interested in writing to this FD */
-			if (fdtab[fd].state != FD_STERROR) {
-				/* Pretend there is something to write */
-				fdtab[fd].ev |= FD_POLL_OUT;
-				if (!fdtab[fd].cb[DIR_WR].f(fd))
-					fdtab[fd].spec.e ^= (FD_EV_WAIT_W ^ FD_EV_SPEC_W);
-				else
-					done = 1;
-			}
+			en &= ~FD_EV_MASK_R;
 		}
-		else if ((eo & FD_EV_MASK_W) == FD_EV_STOP_W) {
+
+		if ((en & FD_EV_MASK_W) == FD_EV_STOP_W) {
 			/* This FD was being polled and is now being removed. */
-			fdtab[fd].spec.e &= ~FD_EV_MASK_W;
+			en &= ~FD_EV_MASK_W;
 		}
 
-		status += done;
-		/* one callback might already have closed the fd by itself */
-		if (fdtab[fd].state == FD_STCLOSE)
-			continue;
-
-		/* Now, we will adjust the event in the poll list. Indeed, it
-		 * is possible that an event which was previously in the poll
-		 * list now goes out, and the opposite is possible too. We can
-		 * have opposite changes for READ and WRITE too.
-		 */
-
-		if ((eo ^ fdtab[fd].spec.e) & FD_EV_RW_PL) {
-			/* poll status changed*/
-			if ((fdtab[fd].spec.e & FD_EV_RW_PL) == 0) {
+		if ((eo ^ en) & FD_EV_RW_PL) {
+			/* poll status changed */
+			if ((en & FD_EV_RW_PL) == 0) {
 				/* fd removed from poll list */
 				opcode = EPOLL_CTL_DEL;
 			}
@@ -392,63 +344,32 @@
 
 			/* construct the epoll events based on new state */
 			ev.events = 0;
-			if (fdtab[fd].spec.e & FD_EV_WAIT_R)
+			if (en & FD_EV_WAIT_R)
 				ev.events |= EPOLLIN;
 
-			if (fdtab[fd].spec.e & FD_EV_WAIT_W)
+			if (en & FD_EV_WAIT_W)
 				ev.events |= EPOLLOUT;
 
 			ev.data.fd = fd;
 			epoll_ctl(epoll_fd, opcode, fd, &ev);
 		}
 
+		fdtab[fd].spec.e = (en << 4) + en;  /* save new events */
 
 		if (!(fdtab[fd].spec.e & FD_EV_RW_SL)) {
 			/* This fd switched to combinations of either WAIT or
 			 * IDLE. It must be removed from the spec list.
 			 */
 			release_spec_entry(fd);
-			continue;
 		}
 	}
 
-	/* It may make sense to immediately return here if there are enough
-	 * processed events, without passing through epoll_wait() because we
-	 * have exactly done a poll.
-	 * Measures have shown a great performance increase if we call the
-	 * epoll_wait() only the second time after speculative accesses have
-	 * succeeded. This reduces the number of unsucessful calls to
-	 * epoll_wait() by a factor of about 3, and the total number of calls
-	 * by about 2.
-	 * However, when we do that after having processed too many events,
-	 * events waiting in epoll() starve for too long a time and tend to
-	 * become themselves eligible for speculative polling. So we try to
-	 * limit this practise to reasonable situations.
-	 */
-
-	spec_processed += status;
+	/* compute the epoll_wait() timeout */
 
-	if (looping) {
-		last_skipped++;
-		return;
-	}
-
-	if (status >= MIN_RETURN_EVENTS && spec_processed < absmaxevents) {
-		/* We have processed at least MIN_RETURN_EVENTS, it's worth
-		 * returning now without checking epoll_wait().
-		 */
-		if (++last_skipped <= 1) {
-			tv_update_date(0, 1);
-			return;
-		}
-	}
-	last_skipped = 0;
-
-	if (nbspec || status || run_queue || signal_queue_len) {
-		/* Maybe we have processed some events that we must report, or
-		 * maybe we still have events in the spec list, or there are
+	if (nbspec || run_queue || signal_queue_len) {
+		/* Maybe we still have events in the spec list, or there are
 		 * some tasks left pending in the run_queue, so we must not
-		 * wait in epoll() otherwise we will delay their delivery by
+		 * wait in epoll() otherwise we would delay their delivery by
 		 * the next timeout.
 		 */
 		wait_time = 0;
@@ -465,21 +386,14 @@
 		}
 	}
 
-	/* now let's wait for real events. We normally use maxpollevents as a
-	 * high limit, unless <nbspec> is already big, in which case we need
-	 * to compensate for the high number of events processed there.
-	 */
-	fd = MIN(absmaxevents, spec_processed);
-	fd = MAX(global.tune.maxpollevents, fd);
-	fd = MIN(maxfd, fd);
-	/* we want to detect if an accept() will create new speculative FDs here */
-	fd_created = 0;
-	spec_processed = 0;
+	/* now let's wait for real events */
+	fd = MIN(maxfd, global.tune.maxpollevents);
 	gettimeofday(&before_poll, NULL);
 	status = epoll_wait(epoll_fd, epoll_events, fd, wait_time);
 	tv_update_date(wait_time, status);
 	measure_idle();
 
+	/* process events */
 	for (count = 0; count < status; count++) {
 		int e = epoll_events[count].events;
 		fd = epoll_events[count].data.fd;
@@ -487,10 +401,6 @@
 		/* it looks complicated but gcc can optimize it away when constants
 		 * have same values.
 		 */
-		DPRINTF(stderr, "%s:%d: fd=%d, ev=0x%08x, e=0x%08x\n",
-			__FUNCTION__, __LINE__,
-			fd, fdtab[fd].ev, e);
-
 		fdtab[fd].ev &= FD_POLL_STICKY;
 		fdtab[fd].ev |= 
 			((e & EPOLLIN ) ? FD_POLL_IN  : 0) |
@@ -514,17 +424,64 @@
 		}
 	}
 
+	/* now process speculative events if any */
+
+	/* Here we have two options :
+	 * - either walk the list forwards and hope to match more events
+	 * - or walk it backwards to minimize the number of changes and
+	 *   to make better use of the cache.
+	 * Tests have shown that walking backwards improves perf by 0.2%.
+	 */
+
+	spec_idx = nbspec;
+	while (likely(spec_idx > 0)) {
+		spec_idx--;
+		fd = spec_list[spec_idx];
+		eo = fdtab[fd].spec.e;  /* save old events */
+
-	if (fd_created) {
-		/* we have created some fds, certainly in return of an accept(),
-		 * and they're marked as speculative. If we can manage to perform
-		 * a read(), we're almost sure to collect all the request at once
-		 * and avoid several expensive wakeups. So let's try now. Anyway,
-		 * if we fail, the tasks are still woken up, and the FD gets marked
-		 * for poll mode.
+		/*
+		 * Process the speculative events.
+		 *
+		 * Principle: events which are marked FD_EV_SPEC are processed
+		 * with their assigned function. If the function returns 0, it
+		 * means there is nothing doable without polling first. We will
+		 * then convert the event to a pollable one by assigning them
+		 * the WAIT status.
 		 */
-		looping = 1;
-		goto re_poll_once;
+
+		fdtab[fd].ev &= FD_POLL_STICKY;
+		if ((eo & FD_EV_MASK_R) == FD_EV_SPEC_R) {
+			/* The owner is interested in reading from this FD */
+			if (fdtab[fd].state != FD_STERROR) {
+				/* Pretend there is something to read */
+				fdtab[fd].ev |= FD_POLL_IN;
+				if (!fdtab[fd].cb[DIR_RD].f(fd))
+					fdtab[fd].spec.e ^= (FD_EV_WAIT_R ^ FD_EV_SPEC_R);
+			}
+		}
+
+		if ((eo & FD_EV_MASK_W) == FD_EV_SPEC_W) {
+			/* The owner is interested in writing to this FD */
+			if (fdtab[fd].state != FD_STERROR) {
+				/* Pretend there is something to write */
+				fdtab[fd].ev |= FD_POLL_OUT;
+				if (!fdtab[fd].cb[DIR_WR].f(fd))
+					fdtab[fd].spec.e ^= (FD_EV_WAIT_W ^ FD_EV_SPEC_W);
+			}
+		}
+
+		/* one callback might already have closed the fd by itself */
+		if (fdtab[fd].state == FD_STCLOSE)
+			continue;
+
+		if (!(fdtab[fd].spec.e & (FD_EV_RW_SL|FD_EV_RW_PL))) {
+			/* This fd switched to IDLE, it can be removed from the spec list. */
+			release_spec_entry(fd);
+			continue;
+		}
 	}
+
+	/* in the end, we have processed status + spec_processed FDs */
 }
 
 /*