MEDIUM: streams: do not use the streams lock anymore

The lock was still used exclusively to deal with the concurrency between
the "show sess" release handler and a stream_new() or stream_free() on
another thread. All other accesses made by "show sess" are already done
under thread isolation. The release handler only requires to unlink its
node when stopping in the middle of a dump (error, timeout etc). Let's
just isolate the thread to deal with this case so that it's compatible
with the dump conditions, and remove all remaining locking on the streams.

This effectively kills the streams lock. The measured gain here is around
1.6% with 4 threads (374krps -> 380k).
diff --git a/include/haproxy/thread.h b/include/haproxy/thread.h
index 4419c2c..65a27b1 100644
--- a/include/haproxy/thread.h
+++ b/include/haproxy/thread.h
@@ -377,7 +377,6 @@
 	STK_SESS_LOCK,
 	APPLETS_LOCK,
 	PEER_LOCK,
-	STRMS_LOCK,
 	SSL_LOCK,
 	SSL_GEN_CERTS_LOCK,
 	PATREF_LOCK,
@@ -430,7 +429,6 @@
 	case STK_SESS_LOCK:        return "STK_SESS";
 	case APPLETS_LOCK:         return "APPLETS";
 	case PEER_LOCK:            return "PEER";
-	case STRMS_LOCK:           return "STRMS";
 	case SSL_LOCK:             return "SSL";
 	case SSL_GEN_CERTS_LOCK:   return "SSL_GEN_CERTS";
 	case PATREF_LOCK:          return "PATREF";
diff --git a/include/haproxy/tinfo-t.h b/include/haproxy/tinfo-t.h
index 89fde4f..d1a4fc4 100644
--- a/include/haproxy/tinfo-t.h
+++ b/include/haproxy/tinfo-t.h
@@ -46,9 +46,7 @@
 	struct list pool_lru_head;                         /* oldest objects   */
 #endif
 	struct list buffer_wq;     /* buffer waiters */
-
 	struct list streams;       /* list of streams attached to this thread */
-	__decl_thread(HA_SPINLOCK_T streams_lock); /* shared with "show sess" */
 
 	/* pad to cache line (64B) */
 	char __pad[0];            /* unused except to check remaining room */
diff --git a/src/stream.c b/src/stream.c
index 83c9345..b23754b 100644
--- a/src/stream.c
+++ b/src/stream.c
@@ -540,9 +540,7 @@
 
 	s->tunnel_timeout = TICK_ETERNITY;
 
-	HA_SPIN_LOCK(STRMS_LOCK, &ti->streams_lock);
 	LIST_ADDQ(&ti->streams, &s->list);
-	HA_SPIN_UNLOCK(STRMS_LOCK, &ti->streams_lock);
 
 	if (flt_stream_init(s) < 0 || flt_stream_start(s) < 0)
 		goto out_fail_accept;
@@ -711,19 +709,20 @@
 
 	stream_store_counters(s);
 
-	HA_SPIN_LOCK(STRMS_LOCK, &ti->streams_lock);
 	list_for_each_entry_safe(bref, back, &s->back_refs, users) {
 		/* we have to unlink all watchers. We must not relink them if
-		 * this stream was the last one in the list.
+		 * this stream was the last one in the list. This is safe to do
+		 * here because we're touching our thread's list so we know
+		 * that other streams are not active, and the watchers will
+		 * only touch their node under thread isolation.
 		 */
-		LIST_DEL(&bref->users);
-		LIST_INIT(&bref->users);
+		LIST_DEL_INIT(&bref->users);
 		if (s->list.n != &ti->streams)
 			LIST_ADDQ(&LIST_ELEM(s->list.n, struct stream *, list)->back_refs, &bref->users);
 		bref->ref = s->list.n;
+		__ha_barrier_store();
 	}
 	LIST_DEL(&s->list);
-	HA_SPIN_UNLOCK(STRMS_LOCK, &ti->streams_lock);
 
 	/* applets do not release session yet */
 	must_free_sess = objt_appctx(sess->origin) && sess->origin == s->si[0].end;
@@ -3447,10 +3446,15 @@
 static void cli_release_show_sess(struct appctx *appctx)
 {
 	if (appctx->st2 == STAT_ST_LIST && appctx->ctx.sess.thr < global.nbthread) {
-		HA_SPIN_LOCK(STRMS_LOCK, &ha_thread_info[appctx->ctx.sess.thr].streams_lock);
+		/* a dump was aborted, either in error or timeout. We need to
+		 * safely detach from the target stream's list. It's mandatory
+		 * to lock because a stream on the target thread could be moving
+		 * our node.
+		 */
+		thread_isolate();
 		if (!LIST_ISEMPTY(&appctx->ctx.sess.bref.users))
 			LIST_DEL(&appctx->ctx.sess.bref.users);
-		HA_SPIN_UNLOCK(STRMS_LOCK, &ha_thread_info[appctx->ctx.sess.thr].streams_lock);
+		thread_release();
 	}
 }