MAJOR: fd: grab the tgid before manipulating running

We now grab a reference to the FD's tgid before manipulating the
running_mask so that we're certain it corresponds to our own group
(and hence that the bits we touch are ours), and we drop it once we've
set the bit. For now there's no measurable performance impact in doing
this, which is great. The lock can be observed in perf top as taking a
small share of the time spent in fd_update_events(), which itself takes
no more than 0.28% of CPU under 8 threads.
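
In short, the protocol this patch converges to around running_mask is
the following (condensed, illustrative sketch only; the exact code is
in fd_takeover() and fd_update_events() in the diff below):

    /* illustration: grab the tgid first, then try to own the running bit */
    if (!fd_grab_tgid(fd, ti->tgid))
        return -1;              /* FD was closed/reassigned to another group */

    old = 0;
    if (!HA_ATOMIC_CAS(&fdtab[fd].running_mask, &old, tid_bit)) {
        fd_drop_tgid(fd);       /* not alone on this FD, release the ref */
        return -1;
    }

    /* ... the FD is ours: update thread_mask, process events ... */

    HA_ATOMIC_AND(&fdtab[fd].running_mask, ~tid_bit);
    fd_drop_tgid(fd);           /* no more changes planned */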

However, because the thread groups are not yet properly spread across
the pollers and the thread masks are still wrong, this will trigger
some BUG_ON() in fd_insert() after a few tens of thousands of
connections, once threads outside of group 1 are reached; this is
expected.
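
Note that fd_grab_tgid()/fd_drop_tgid() themselves are not part of this
patch. A minimal sketch of the idea, assuming the reference count shares
an atomic word with the tgid (the refc_tgid field name and the bit layout
are assumptions here, not taken from this diff), could look like:

    /* sketch only: assumes fdtab[fd].refc_tgid packs a 16-bit tgid in the
     * low bits and a reference count in the upper bits.
     */
    static inline int fd_grab_tgid(int fd, uint desired_tgid)
    {
        /* unconditionally take a ref, then verify the tgid didn't change */
        uint old = _HA_ATOMIC_FETCH_ADD(&fdtab[fd].refc_tgid, 0x10000) & 0xffff;

        if (likely(old == desired_tgid))
            return 1;
        /* not our group (FD was closed/reassigned): give the ref back */
        _HA_ATOMIC_SUB(&fdtab[fd].refc_tgid, 0x10000);
        return 0;
    }

    static inline void fd_drop_tgid(int fd)
    {
        _HA_ATOMIC_SUB(&fdtab[fd].refc_tgid, 0x10000);
    }

As long as at least one reference is held, the tgid (and thus the
meaning of the mask bits) cannot change under the holder's feet.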
diff --git a/src/fd.c b/src/fd.c
index f93c117..6a7043c 100644
--- a/src/fd.c
+++ b/src/fd.c
@@ -355,6 +355,11 @@
 	 */
 	BUG_ON(fd < 0 || fd >= global.maxsock);
 
+	/* the tgid cannot change before a complete close so we should never
+	 * face the situation where we try to close an fd that was reassigned.
+	 */
+	BUG_ON(fd_tgid(fd) != ti->tgid && !thread_isolated());
+
 	/* we must postpone removal of an FD that may currently be in use
 	 * by another thread. This can happen in the following two situations:
 	 *   - after a takeover, the owning thread closes the connection but
@@ -422,11 +427,17 @@
 	/* we must be alone to work on this idle FD. If not, it means that its
 	 * poller is currently waking up and is about to use it, likely to
 	 * close it on shut/error, but maybe also to process any unexpectedly
-	 * pending data.
+	 * pending data. It's also possible that the FD was closed and
+	 * reassigned to another thread group, so let's be careful.
 	 */
+	if (unlikely(!fd_grab_tgid(fd, ti->tgid)))
+		return -1;
+
 	old = 0;
-	if (!HA_ATOMIC_CAS(&fdtab[fd].running_mask, &old, tid_bit))
+	if (!HA_ATOMIC_CAS(&fdtab[fd].running_mask, &old, tid_bit)) {
+		fd_drop_tgid(fd);
 		return -1;
+	}
 
 	/* success, from now on it's ours */
 	HA_ATOMIC_STORE(&fdtab[fd].thread_mask, tid_bit);
@@ -439,6 +450,9 @@
 
 	/* we're done with it */
 	HA_ATOMIC_AND(&fdtab[fd].running_mask, ~tid_bit);
+
+	/* no more changes planned */
+	fd_drop_tgid(fd);
 	return 0;
 }
 
@@ -484,6 +498,17 @@
 
 	_HA_ATOMIC_AND(&th_ctx->flags, ~TH_FL_STUCK); // this thread is still running
 
+	if (unlikely(!fd_grab_tgid(fd, ti->tgid))) {
+		/* the FD changed to another tgid, we can't safely
+		 * check it anymore. The bits in the masks are not
+		 * ours anymore and we're not allowed to touch them.
+		 * Ours have already been cleared and the FD was
+		 * closed in between so we can safely leave now.
+		 */
+		activity[tid].poll_drop_fd++;
+		return FD_UPDT_CLOSED;
+	}
+
 	/* do nothing if the FD was taken over under us */
 	do {
 		/* make sure we read a synchronous copy of rmask and tmask
@@ -502,10 +527,15 @@
 			/* Let the poller know this FD was lost */
 			if (!HA_ATOMIC_BTS(&fdtab[fd].update_mask, tid))
 				fd_updt[fd_nbupdt++] = fd;
+
+			fd_drop_tgid(fd);
 			return FD_UPDT_MIGRATED;
 		}
 	} while (!HA_ATOMIC_CAS(&fdtab[fd].running_mask, &rmask, rmask | tid_bit));
 
+	/* with running we're safe now, we can drop the reference */
+	fd_drop_tgid(fd);
+
 	locked = (tmask != tid_bit);
 
 	/* OK now we are guaranteed that our thread_mask was present and