MINOR: sched: have one runqueue ticks counter per thread

The rqueue_ticks variable counts the number of task wakeups and is used to
position new tasks in the run queue, but since we've had per-thread
run queues, the values there are not very relevant anymore and the
nice value doesn't apply well if some threads are more loaded than
others. In addition, letting all threads compete over a shared counter
is not smart as this may cause some excessive contention.

Let's move this index close to the run queues themselves, i.e. one per
thread and a global one. In addition to improving fairness, this has
increased global performance by 2% on 16 threads thanks to the lower
contention on rqueue_ticks.

Fairness issues were not observed, but if any were to be reported, this
patch could be backported as far as 2.0 to address them.
diff --git a/src/task.c b/src/task.c
index 31f3102..9c8312c 100644
--- a/src/task.c
+++ b/src/task.c
@@ -48,11 +48,11 @@
 __decl_aligned_rwlock(wq_lock);   /* RW lock related to the wait queue */
 
 #ifdef USE_THREAD
-struct eb_root timers;      /* sorted timers tree, global */
-struct eb_root rqueue;      /* tree constituting the run queue */
+struct eb_root timers;      /* sorted timers tree, global, accessed under wq_lock */
+struct eb_root rqueue;      /* tree constituting the global run queue, accessed under rq_lock */
+static unsigned int global_rqueue_ticks;  /* insertion count in the grq, use rq_lock */
 #endif
 
-static unsigned int rqueue_ticks;  /* insertion count */
 
 struct task_per_thread task_per_thread[MAX_THREADS];
 
@@ -130,10 +130,11 @@
 #ifdef USE_THREAD
 	if (root == &rqueue) {
 		global_tasks_mask |= t->thread_mask;
+		t->rq.key = ++global_rqueue_ticks;
 		__ha_barrier_store();
-	}
+	} else
 #endif
-	t->rq.key = _HA_ATOMIC_ADD(&rqueue_ticks, 1);
+		t->rq.key = ++sched->rqueue_ticks;
 
 	if (likely(t->nice)) {
 		int offset;
@@ -643,7 +644,7 @@
 		if ((global_tasks_mask & tid_bit) && !grq) {
 #ifdef USE_THREAD
 			HA_SPIN_LOCK(TASK_RQ_LOCK, &rq_lock);
-			grq = eb32sc_lookup_ge(&rqueue, rqueue_ticks - TIMER_LOOK_BACK, tid_bit);
+			grq = eb32sc_lookup_ge(&rqueue, global_rqueue_ticks - TIMER_LOOK_BACK, tid_bit);
 			if (unlikely(!grq)) {
 				grq = eb32sc_first(&rqueue, tid_bit);
 				if (!grq) {
@@ -659,7 +660,7 @@
 		 */
 
 		if (!lrq) {
-			lrq = eb32sc_lookup_ge(&tt->rqueue, rqueue_ticks - TIMER_LOOK_BACK, tid_bit);
+			lrq = eb32sc_lookup_ge(&tt->rqueue, tt->rqueue_ticks - TIMER_LOOK_BACK, tid_bit);
 			if (unlikely(!lrq))
 				lrq = eb32sc_first(&tt->rqueue, tid_bit);
 		}