MAJOR: fd/threads: Make the fdcache mostly lockless.

Create a local, per-thread fdcache for file descriptors that belong to only
one thread, and make the global fd cache mostly lockless, as we can get a
lot of contention on the fd cache lock.
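
An fd whose thread_mask has a single bit set (mask & (mask - 1) == 0) is
handled by a single thread and thus goes into that thread's local cache;
fds shared between threads go into the global fd_cache list.

For reference, the lockless list links each fd's cache entry to its
neighbours by fd index rather than by pointer, with a few negative values
reserved as markers (next <= -3: not in the cache, -2: locked, -1: last
element; prev uses -2 for locked and -1 for first element). The sketch
below is only an illustration of that encoding, assuming a layout of
struct fdlist / struct fdlist_entry matching the fields used here
(first/last and next/prev); it performs a plain single-threaded append,
whereas the real fd_add_to_fd_list() does the same link updates with CAS
loops and memory barriers.

    #include <stdio.h>

    #define MAX_FDS 16

    struct fdlist_entry {
        int next;   /* <= -3: not in a list, -2: locked, -1: last, >= 0: next fd */
        int prev;   /* -2: locked, -1: first, >= 0: previous fd */
    };

    struct fdlist {
        int first;  /* -1 when the list is empty */
        int last;   /* -1 when the list is empty */
    };

    static struct fdlist_entry entries[MAX_FDS];
    static struct fdlist cache = { -1, -1 };

    /* Single-threaded illustration of an append: link past the current
     * tail and update first/last, the same transitions that
     * fd_add_to_fd_list() performs with atomic operations.
     */
    static void sketch_append(int fd)
    {
        entries[fd].prev = cache.last;
        entries[fd].next = -1;
        if (cache.last == -1)
            cache.first = fd;
        else
            entries[cache.last].next = fd;
        cache.last = fd;
    }

    int main(void)
    {
        for (int fd = 0; fd < MAX_FDS; fd++)
            entries[fd].next = entries[fd].prev = -3; /* not in any list */

        sketch_append(4);
        sketch_append(7);
        for (int fd = cache.first; fd != -1; fd = entries[fd].next)
            printf("fd %d (prev=%d next=%d)\n", fd, entries[fd].prev, entries[fd].next);
        return 0;
    }

Removal works the same way in reverse: fd_rm_from_fd_list() locks the entry
and both neighbours before unlinking, and stores -(next + 4) into next on
exit, which is always <= -3 and therefore reads as "not in the cache"
regardless of the old next value.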
diff --git a/include/proto/fd.h b/include/proto/fd.h
index a7e70b7..595af90 100644
--- a/include/proto/fd.h
+++ b/include/proto/fd.h
@@ -33,8 +33,9 @@
 
 /* public variables */
 
-extern unsigned int *fd_cache;      // FD events cache
-extern int fd_cache_num;            // number of events in the cache
+extern volatile struct fdlist fd_cache;
+extern volatile struct fdlist fd_cache_local[MAX_THREADS];
+
 extern unsigned long fd_cache_mask; // Mask of threads with events in the cache
 
 extern THREAD_LOCAL int *fd_updt;  // FD updates list
@@ -104,45 +105,224 @@
 	fd_updt[fd_nbupdt++] = fd;
 }
 
+
+#define _GET_NEXT(fd) fdtab[fd].fdcache_entry.next
+#define _GET_PREV(fd) fdtab[fd].fdcache_entry.prev
+
+static inline void fd_add_to_fd_list(volatile struct fdlist *list, int fd)
+{
+	int next;
+	int new;
+	int old;
+	int last;
+
+redo_next:
+	next = _GET_NEXT(fd);
+	/*
+	 * Check that we're not already in the cache, and if not, lock us.
+	 * <= -3 means not in the cache, -2 means locked, -1 means we're
+	 * in the cache and are the last element, and >= 0 gives the FD of
+	 * the next element in the cache.
+	 */
+	if (next >= -2)
+		goto done;
+	if (!HA_ATOMIC_CAS(&_GET_NEXT(fd), &next, -2))
+		goto redo_next;
+	__ha_barrier_store();
+redo_last:
+	/* First, insert in the linked list */
+	last = list->last;
+	old = -1;
+	new = fd;
+	if (unlikely(last == -1)) {
+		/* list is empty, try to add ourselves alone so that list->last=fd */
+
+		_GET_PREV(fd) = last;
+
+		/* Make sure the "prev" store is visible before we update the last entry */
+		__ha_barrier_store();
+		if (unlikely(!HA_ATOMIC_CAS(&list->last, &old, new)))
+			goto redo_last;
+
+		/* list->first was necessarily -1, we're guaranteed to be alone here */
+		list->first = fd;
+
+		/* since we're alone at the end of the list and still locked (-2),
+		 * we know no one tried to add past us. Mark the end of the list.
+		 */
+		_GET_NEXT(fd) = -1;
+		goto done; /* We're done! */
+	} else {
+		/* non-empty list, add past the tail */
+		do {
+			new = fd;
+			old = -1;
+			_GET_PREV(fd) = last;
+
+			__ha_barrier_store();
+
+			/* Add ourselves past the last element.
+			 * The CAS will only succeed if its next is still -1,
+			 * i.e. it is still in the cache and still the last element.
+			 */
+			if (likely(HA_ATOMIC_CAS(&_GET_NEXT(last), &old, new)))
+				break;
+			goto redo_last;
+		} while (1);
+	}
+	/* Then, update the last entry */
+redo_fd_cache:
+	last = list->last;
+	__ha_barrier_load();
+
+	if (unlikely(!HA_ATOMIC_CAS(&list->last, &last, fd)))
+		goto redo_fd_cache;
+	__ha_barrier_store();
+	_GET_NEXT(fd) = -1;
+	__ha_barrier_store();
+done:
+	return;
+}
 
 /* Allocates a cache entry for a file descriptor if it does not yet have one.
  * This can be done at any time.
  */
 static inline void fd_alloc_cache_entry(const int fd)
 {
-	HA_RWLOCK_WRLOCK(FDCACHE_LOCK, &fdcache_lock);
-	if (fdtab[fd].cache)
-		goto end;
-	fd_cache_num++;
-	fd_cache_mask |= fdtab[fd].thread_mask;
-	fdtab[fd].cache = fd_cache_num;
-	fd_cache[fd_cache_num-1] = fd;
-  end:
-	HA_RWLOCK_WRUNLOCK(FDCACHE_LOCK, &fdcache_lock);
+	if (!(fdtab[fd].thread_mask & (fdtab[fd].thread_mask - 1)))
+		fd_add_to_fd_list(&fd_cache_local[my_ffsl(fdtab[fd].thread_mask) - 1], fd);
+	else
+		fd_add_to_fd_list(&fd_cache, fd);
+}
+
+static inline void fd_rm_from_fd_list(volatile struct fdlist *list, int fd)
+{
+#if defined(HA_HAVE_CAS_DW) || defined(HA_CAS_IS_8B)
+	volatile struct fdlist_entry cur_list, next_list;
+#endif
+	int old;
+	int new = -2;
+	volatile int prev;
+	volatile int next;
+	int last;
+
+lock_self:
+#if (defined(HA_CAS_IS_8B) || defined(HA_HAVE_CAS_DW))
+	next_list.next = next_list.prev = -2;
+	cur_list.prev = _GET_PREV(fd);
+	cur_list.next = _GET_NEXT(fd);
+	/* First, attempt to lock our own entries */
+	do {
+		/* The FD is not in the FD cache, give up */
+		if (unlikely(cur_list.next <= -3))
+			return;
+		if (unlikely(cur_list.prev == -2 || cur_list.next == -2))
+			goto lock_self;
+	} while (
+#ifdef HA_CAS_IS_8B
+	    unlikely(!HA_ATOMIC_CAS(((void **)(void *)&_GET_NEXT(fd)), ((void **)(void *)&cur_list), (*(void **)(void *)&next_list))))
+#else
+	    unlikely(!__ha_cas_dw((void *)&_GET_NEXT(fd), (void *)&cur_list, (void *)&next_list)))
+#endif
+	    ;
+	next = cur_list.next;
+	prev = cur_list.prev;
+
+#else
+lock_self_next:
+	next = _GET_NEXT(fd);
+	if (next == -2)
+		goto lock_self_next;
+	if (next <= -3)
+		goto done;
+	if (unlikely(!HA_ATOMIC_CAS(&_GET_NEXT(fd), &next, -2)))
+		goto lock_self_next;
+lock_self_prev:
+	prev = _GET_PREV(fd);
+	if (prev == -2)
+		goto lock_self_prev;
+	if (unlikely(!HA_ATOMIC_CAS(&_GET_PREV(fd), &prev, -2)))
+		goto lock_self_prev;
+#endif
+	__ha_barrier_store();
+
+	/* Now, lock the entries of our neighbours */
+	if (likely(prev != -1)) {
+redo_prev:
+		old = fd;
+
+		if (unlikely(!HA_ATOMIC_CAS(&_GET_NEXT(prev), &old, new))) {
+			if (unlikely(old == -2)) {
+				/* Neighbour already locked, give up and
+				 * retry once it's done
+				 */
+				_GET_PREV(fd) = prev;
+				__ha_barrier_store();
+				_GET_NEXT(fd) = next;
+				__ha_barrier_store();
+				goto lock_self;
+			}
+			goto redo_prev;
+		}
+	}
+	if (likely(next != -1)) {
+redo_next:
+		old = fd;
+		if (unlikely(!HA_ATOMIC_CAS(&_GET_PREV(next), &old, new))) {
+			if (unlikely(old == -2)) {
+				/* Neighbour already locked, give up and
+				 * retry once it's done
+				 */
+				if (prev != -1) {
+					_GET_NEXT(prev) = fd;
+					__ha_barrier_store();
+				}
+				_GET_PREV(fd) = prev;
+				__ha_barrier_store();
+				_GET_NEXT(fd) = next;
+				__ha_barrier_store();
+				goto lock_self;
+			}
+			goto redo_next;
+		}
+	}
+	if (list->first == fd)
+		list->first = next;
+	__ha_barrier_store();
+	last = list->last;
+	while (unlikely(last == fd && (!HA_ATOMIC_CAS(&list->last, &last, prev))))
+		__ha_compiler_barrier();
+	/* Make sure we let other threads know we're no longer in the cache,
+	 * before releasing our neighbours.
+	 */
+	__ha_barrier_store();
+	if (likely(prev != -1))
+		_GET_NEXT(prev) = next;
+	__ha_barrier_store();
+	if (likely(next != -1))
+		_GET_PREV(next) = prev;
+	__ha_barrier_store();
+	/* Ok, now we're out of the fd cache */
+	_GET_NEXT(fd) = -(next + 4);
+	__ha_barrier_store();
+done:
+	return;
 }
 
+#undef _GET_NEXT
+#undef _GET_PREV
+
+
 /* Removes entry used by fd <fd> from the FD cache and replaces it with the
- * last one. The fdtab.cache is adjusted to match the back reference if needed.
+ * last one.
  * If the fd has no entry assigned, return immediately.
  */
 static inline void fd_release_cache_entry(int fd)
 {
-	unsigned int pos;
-
-	HA_RWLOCK_WRLOCK(FDCACHE_LOCK, &fdcache_lock);
-	pos = fdtab[fd].cache;
-	if (!pos)
-		goto end;
-	fdtab[fd].cache = 0;
-	fd_cache_num--;
-	if (likely(pos <= fd_cache_num)) {
-		/* was not the last entry */
-		fd = fd_cache[fd_cache_num];
-		fd_cache[pos - 1] = fd;
-		fdtab[fd].cache = pos;
-	}
-  end:
-	HA_RWLOCK_WRUNLOCK(FDCACHE_LOCK, &fdcache_lock);
+	if (!(fdtab[fd].thread_mask & (fdtab[fd].thread_mask - 1)))
+		fd_rm_from_fd_list(&fd_cache_local[my_ffsl(fdtab[fd].thread_mask) - 1], fd);
+	else
+		fd_rm_from_fd_list(&fd_cache, fd);
 }
 
 /* Computes the new polled status based on the active and ready statuses, for
@@ -402,7 +582,6 @@
 	fdtab[fd].update_mask &= ~tid_bit;
 	fdtab[fd].linger_risk = 0;
 	fdtab[fd].cloned = 0;
-	fdtab[fd].cache = 0;
 	fdtab[fd].thread_mask = thread_mask;
 	/* note: do not reset polled_mask here as it indicates which poller
 	 * still knows this FD from a possible previous round.