MINOR: listener: always compare the local thread as well

By comparing the local thread's load with that of the least loaded
thread, we can further improve fairness while also improving locality,
since it allows a small ratio of connections not to be migrated. This
is visible in CPU usage with long connections on very large thread
counts (224) and high bandwidth (200G). The cost of checking the local
thread's load remains fairly low, so there's no reason not to do this.
We continue to update the index when we select the local thread,
because it means that the two other threads were both more loaded, so
we'd rather find better ones.
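
For illustration, here is a minimal standalone sketch of the selection
rule this patch introduces. It only models the decision itself, not the
listener/shard structures: q0 stands for the local thread's load (queue
length plus active connections), q1/q2 for the loads of the two
candidate threads, and the pick_thread() helper and struct names are
purely illustrative, not taken from the actual code.

/* Standalone sketch of the thread selection rule (illustrative only). */
#include <stdio.h>

struct pick {
	int thr;      /* chosen thread id */
	int migrate;  /* 0 if we stayed on the local thread */
};

static struct pick pick_thread(int local, int q0,
                               int t1, int q1,
                               int t2, int q2)
{
	struct pick p;

	if (q1 < q2) {
		/* t1 is the least loaded remote candidate; keep it unless
		 * the local thread is at least as good, in which case we
		 * avoid the migration entirely.
		 */
		p.thr = (q0 <= q1) ? local : t1;
	}
	else if (q1 > q2) {
		/* symmetric case: t2 is the least loaded remote candidate */
		p.thr = (q0 <= q2) ? local : t2;
	}
	else {
		/* q1 == q2: t1 is picked by default; the local thread must
		 * be strictly better than both to win the tie.
		 */
		p.thr = (q0 < q1) ? local : t1;
	}
	p.migrate = (p.thr != local);
	return p;
}

int main(void)
{
	/* local thread 0 with load 3 vs remote threads 5 (load 3) and
	 * 9 (load 7): the best remote is only as good as us, so we stay
	 * local and avoid the migration.
	 */
	struct pick p = pick_thread(0, 3, 5, 3, 9, 7);

	printf("chosen thread: %d, migrate: %d\n", p.thr, p.migrate);
	return 0;
}

Note the <= versus < distinction: when the two remote candidates are
equally loaded, the local thread only wins the tie when it is strictly
better than both, which matches the comment added in the patch below.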
diff --git a/src/listener.c b/src/listener.c
index 0385094..a68cb12 100644
--- a/src/listener.c
+++ b/src/listener.c
@@ -1229,7 +1229,7 @@
 			 */
 			thr_idx_ptr = l->rx.shard_info ? &((struct listener *)(l->rx.shard_info->ref->owner))->thr_idx : &l->thr_idx;
 			while (1) {
-				int q1, q2;
+				int q0, q1, q2;
 
 				/* calculate r1/g1/t1 first (ascending idx) */
 				n0 = _HA_ATOMIC_LOAD(thr_idx_ptr);
@@ -1337,12 +1337,15 @@
 
 				/* here we have (r1,g1,t1) that designate the first receiver, its
 				 * thread group and local thread, and (r2,g2,t2) that designate
-				 * the second receiver, its thread group and local thread.
+				 * the second receiver, its thread group and local thread. We'll
+				 * also consider the local thread with q0.
 				 */
+				q0 = accept_queue_ring_len(&accept_queue_rings[tid]);
 				q1 = accept_queue_ring_len(&accept_queue_rings[g1->base + t1]);
 				q2 = accept_queue_ring_len(&accept_queue_rings[g2->base + t2]);
 
 				/* add to this the currently active connections */
+				q0 += _HA_ATOMIC_LOAD(&l->thr_conn[ti->ltid]);
 				if (l->rx.shard_info) {
 					q1 += _HA_ATOMIC_LOAD(&((struct listener *)l->rx.shard_info->members[r1]->owner)->thr_conn[t1]);
 					q2 += _HA_ATOMIC_LOAD(&((struct listener *)l->rx.shard_info->members[r2]->owner)->thr_conn[t2]);
@@ -1361,12 +1364,17 @@
 				 *   q1 = q2 : both are equally loaded, thus we pick t1
 				 *             and update t1 as it will become more loaded
 				 *             than t2.
+				 * On top of that, if in the end the current thread appears
+				 * to be as good of a deal, we'll prefer it over a foreign
+				 * one as it will improve locality and avoid a migration.
 				 */
 
 				if (q1 - q2 < 0) {
 					t = g1->base + t1;
+					if (q0 <= q1)
+						t = tid;
 
-					if (l->rx.shard_info)
+					if (l->rx.shard_info && t != tid)
 						new_li = l->rx.shard_info->members[r1]->owner;
 
 					t2--;
@@ -1378,15 +1386,19 @@
 				}
 				else if (q1 - q2 > 0) {
 					t = g2->base + t2;
+					if (q0 <= q2)
+						t = tid;
 
-					if (l->rx.shard_info)
+					if (l->rx.shard_info && t != tid)
 						new_li = l->rx.shard_info->members[r2]->owner;
 					goto updt_t1;
 				}
-				else {
+				else { // q1 == q2
 					t = g1->base + t1;
+					if (q0 < q1) // local must be strictly better than both
+						t = tid;
 
-					if (l->rx.shard_info)
+					if (l->rx.shard_info && t != tid)
 						new_li = l->rx.shard_info->members[r1]->owner;
 				updt_t1:
 					t1++;