REORG: fd: move the speculative I/O management from ev_sepoll

The speculative I/O will need to be ported to all pollers, so move
this to fd.c.
diff --git a/src/fd.c b/src/fd.c
index 9a73d35..5d63cc3 100644
--- a/src/fd.c
+++ b/src/fd.c
@@ -1,13 +1,85 @@
 /*
  * File descriptors management functions.
  *
- * Copyright 2000-2008 Willy Tarreau <w@1wt.eu>
+ * Copyright 2000-2012 Willy Tarreau <w@1wt.eu>
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
  * as published by the Free Software Foundation; either version
  * 2 of the License, or (at your option) any later version.
  *
+ * This code implements "speculative I/O". The principle is to try to perform
+ * expected I/O before registering the events in the poller. Each time this
+ * succeeds, it saves a possibly expensive system call to set the event. It
+ * generally succeeds for all reads after an accept(), and for writes after a
+ * connect(). It also improves performance for streaming connections because
+ * even if only one side is polled, the other one may react accordingly
+ * depending on the fill level of the buffer. This behaviour is also the only
+ * one compatible with event-based pollers (eg: EPOLL_ET).
+ *
+ * More importantly, it enables I/O operations that are backed by invisible
+ * buffers. For example, SSL is able to read a whole socket buffer and not
+ * deliver it to the application buffer because it's full. Unfortunately, it
+ * won't be reported by a poller anymore until some new activity happens. The
+ * only way to call it again is thus to perform speculative I/O as soon as
+ * reading on the FD is enabled again.
+ *
+ * The speculative I/O uses a list of expected events and a list of updates.
+ * Expected events are events that are expected to come and that we must report
+ * to the application until it asks to stop or to poll. Updates are new requests
+ * for changing an FD state. Updates are the only way to create new events. This
+ * is important because it means that the number of speculative events cannot
+ * increase between updates and will only grow one at a time while processing
+ * updates. All updates must always be processed, though events might be
+ * processed by small batches if required.
+ *
+ * There is no direct link between the FD and the updates list. There is only a
+ * bit in the fdtab[] to indicate that a file descriptor is already present in
+ * the updates list. Once an fd is present in the updates list, it will have to
+ * be considered even if its changes are reverted in the middle or if the fd is
+ * replaced.
+ *
+ * It is important to understand that as long as all expected events are
+ * processed, they might starve the polled events, especially because polled
+ * I/O starvation quickly induces more speculative I/O. One solution to this
+ * consists in only processing a part of the events at once, but one drawback
+ * is that unhandled events will still wake the poller up. Using an event-driven
+ * poller such as EPOLL_ET will solve this issue though.
+ *
+ * A file descriptor has a distinct state for each direction. This state is a
+ * combination of two bits :
+ *  bit 0 = active Y/N : is set if the FD is active, which means that its
+ *          handler will be called without prior polling ;
+ *  bit 1 = polled Y/N : is set if the FD was subscribed to polling
+ *
+ * It is perfectly valid to have both bits set at a time, which generally means
+ * that the FD was reported by polling, was marked active and not yet unpolled.
+ * Such a state must not last long to avoid unneeded wakeups.
+ *
+ * The state of the FD as of last change is preserved in two other bits. These
+ * ones are useful to save a significant amount of system calls during state
+ * changes, because there is no need to update the FD status in the system until
+ * we're about to call the poller.
+ *
+ * Since we do not want to scan all the FD list to find speculative I/O events,
+ * we store them in a list consisting of a linear array holding only the FD
+ * indexes right now. Note that a closed FD cannot exist in the spec list,
+ * because it is closed by fd_delete() which in turn calls __fd_clo() which
+ * always removes it from the list.
+ *
+ * For efficiency reasons, we will store the Read and Write bits interlaced to
+ * form a 4-bit field, so that we can simply shift the value right by 0/1 and
+ * get what we want :
+ *    3  2  1  0
+ *   Wp Rp Wa Ra
+ *
+ * The FD array has to hold a back reference to the speculative list. This
+ * reference is always valid unless the FD is currently being polled and not
+ * updated (in which case the reference points to index 0).
+ *
+ * We store the FD state in the 4 lower bits of fdtab[fd].spec_e, and save the
+ * previous state upon changes in the 4 higher bits, so that changes are easy
+ * to spot.
  */
 
 #include <stdio.h>
@@ -18,6 +90,8 @@
 #include <common/compat.h>
 #include <common/config.h>
 
+#include <types/global.h>
+
 #include <proto/fd.h>
 #include <proto/port_range.h>
 
@@ -31,6 +105,11 @@
 struct poller cur_poller;
 int nbpollers = 0;
 
+/* FD status is defined by the poller's status and by the speculative I/O list */
+int fd_nbspec = 0;             // number of speculative events currently stored in fd_spec[]
+int fd_nbupdt = 0;             // number of pending updates currently stored in fd_updt[]
+unsigned int *fd_spec = NULL;  // speculative I/O list (array of FD indexes); NULL until allocated with global.maxsock entries
+unsigned int *fd_updt = NULL;  // FD updates list; NULL until allocated with global.maxsock entries
 
 /* Deletes an FD from the fdsets, and recomputes the maxfd limit.
  * The file descriptor is also closed.
@@ -68,6 +147,11 @@
 	int p;
 	struct poller *bp;
 
+	if ((fd_spec = (uint32_t *)calloc(1, sizeof(uint32_t) * global.maxsock)) == NULL)
+		goto fail_spec;
+
+	if ((fd_updt = (uint32_t *)calloc(1, sizeof(uint32_t) * global.maxsock)) == NULL)
+		goto fail_updt;
 
 	do {
 		bp = NULL;
@@ -84,6 +168,11 @@
 		}
 	} while (!bp || bp->pref == 0);
 	return 0;
+
+ fail_updt:
+	free(fd_spec);
+ fail_spec:
+	return 0;
 }
 
 /*
@@ -100,6 +189,11 @@
 		if (bp && bp->pref)
 			bp->term(bp);
 	}
+
+	free(fd_updt);
+	free(fd_spec);
+	fd_updt = NULL;
+	fd_spec = NULL;
 }
 
 /*