MEDIUM: fd/threads: only grab the fd's lock if the FD has more than one thread

The vast majority of FDs are only seen by one thread. Currently the lock
on FDs costs a lot because it is touched often, even though there should
be very little contention. This patch ensures that the lock is only
grabbed if the FD is shared by more than one thread, since an FD seen by
a single thread cannot be accessed concurrently and needs no locking.
Doing so resulted in a 15% performance boost on a 12-thread test.
diff --git a/src/fd.c b/src/fd.c
index 1a5419d..c467489 100644
--- a/src/fd.c
+++ b/src/fd.c
@@ -359,7 +359,10 @@
  */
 static void fd_dodelete(int fd, int do_close)
 {
-	HA_SPIN_LOCK(FD_LOCK, &fdtab[fd].lock);
+	unsigned long locked = atleast2(fdtab[fd].thread_mask);
+
+	if (locked)
+		HA_SPIN_LOCK(FD_LOCK, &fdtab[fd].lock);
 	if (fdtab[fd].linger_risk) {
 		/* this is generally set when connecting to servers */
 		setsockopt(fd, SOL_SOCKET, SO_LINGER,
@@ -379,7 +382,8 @@
 		polled_mask[fd] = 0;
 		close(fd);
 	}
-	HA_SPIN_UNLOCK(FD_LOCK, &fdtab[fd].lock);
+	if (locked)
+		HA_SPIN_UNLOCK(FD_LOCK, &fdtab[fd].lock);
 }
 
 /* Deletes an FD from the fdsets.
@@ -417,7 +421,7 @@
 			continue;
 
 		HA_ATOMIC_OR(&fd_cache_mask, tid_bit);
-		if (HA_SPIN_TRYLOCK(FD_LOCK, &fdtab[fd].lock)) {
+		if (atleast2(fdtab[fd].thread_mask) && HA_SPIN_TRYLOCK(FD_LOCK, &fdtab[fd].lock)) {
 			activity[tid].fd_lock++;
 			continue;
 		}
@@ -432,12 +436,14 @@
 			fdtab[fd].ev |= FD_POLL_OUT;
 
 		if (fdtab[fd].iocb && fdtab[fd].owner && fdtab[fd].ev) {
-			HA_SPIN_UNLOCK(FD_LOCK, &fdtab[fd].lock);
+			if (atleast2(fdtab[fd].thread_mask))
+				HA_SPIN_UNLOCK(FD_LOCK, &fdtab[fd].lock);
 			fdtab[fd].iocb(fd);
 		}
 		else {
 			fd_release_cache_entry(fd);
-			HA_SPIN_UNLOCK(FD_LOCK, &fdtab[fd].lock);
+			if (atleast2(fdtab[fd].thread_mask))
+				HA_SPIN_UNLOCK(FD_LOCK, &fdtab[fd].lock);
 		}
 	}
 }