MEDIUM: task: remove the tasks_run_queue counter and have one per thread

This counter is solely used for reporting in the stats and is the hottest
thread contention point to date. Moving it to the per-thread scheduler
context, with a separate one for the global run queue, dramatically
improves performance, showing a 12% boost in request rate on 16 threads!
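
The reporting paths in the diff below switch to a total_run_queues()
helper whose definition lives outside the hunks shown here. As a minimal
sketch only (the body is assumed, not part of this patch; grq_total and
rq_total come from the patch, while task_per_thread and global.nbthread
are the usual HAProxy identifiers), it would simply sum the global
counter and every thread's rq_total:

    /* Assumed implementation sketch, not shown in this patch: sum the
     * global run queue counter and each thread's rq_total to rebuild
     * the total that tasks_run_queue used to provide for reporting.
     */
    static inline unsigned int total_run_queues(void)
    {
        unsigned int ret = 0;
        int thr;

    #ifdef USE_THREAD
        ret += grq_total;
    #endif
        for (thr = 0; thr < global.nbthread; thr++)
            ret += task_per_thread[thr].rq_total;
        return ret;
    }

Since the sum is only used for stats, slightly stale per-thread values
read without a lock are acceptable here.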

In addition, the thread debugging output, which used to rely on
rqueue_size, was not entirely accurate as it only reported task counts.
Now we can report each thread's exact run queue length.

It is also interesting to note that there are still a few other task/tasklet
counters in the scheduler that are not efficiently updated because some cover
a single area and others cover multiple areas. It looks like having a distinct
counter for each of the following entries would help and would keep the code
a bit cleaner:
  - global run queue (tree)
  - per-thread run queue (tree)
  - per-thread shared tasklets list
  - per-thread local lists

Maybe even splitting the shared tasklet lists between pure tasklets and
tasks, instead of mixing both in a single list, would simplify the code,
because there remain a number of places where several counters have to
be updated at once; a possible counter layout is sketched below.
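
For illustration only, and with hypothetical names (this patch only
introduces rq_total and grq_total), one distinct counter per area could
look like this:

    /* Illustration only: hypothetical per-area counters matching the
     * list above; only rq_total and grq_total exist in this patch.
     */
    unsigned int grq_total;        /* global run queue (tree) */
    struct task_per_thread_ctrs {
        unsigned int rq_total;     /* per-thread run queue (tree) */
        unsigned int sh_tl_total;  /* per-thread shared tasklets list */
        unsigned int loc_tl_total; /* per-thread local lists */
    };
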
diff --git a/src/debug.c b/src/debug.c
index bf64f19..3162d32 100644
--- a/src/debug.c
+++ b/src/debug.c
@@ -175,7 +175,7 @@
 			LIST_ISEMPTY(&task_per_thread[thr].tasklets[TL_BULK]) &&
 			MT_LIST_ISEMPTY(&task_per_thread[thr].shared_tasklet_list)),
 	              task_per_thread[thr].task_list_size,
-	              task_per_thread[thr].rqueue_size,
+	              task_per_thread[thr].rq_total,
 	              stuck,
 	              !!(task_profiling_mask & thr_bit));
 
diff --git a/src/stats.c b/src/stats.c
index a63178d..e124f28 100644
--- a/src/stats.c
+++ b/src/stats.c
@@ -3339,7 +3339,7 @@
 	              actconn, pipes_used, pipes_used+pipes_free, read_freq_ctr(&global.conn_per_sec),
 		      bps >= 1000000000UL ? (bps / 1000000000.0) : bps >= 1000000UL ? (bps / 1000000.0) : (bps / 1000.0),
 		      bps >= 1000000000UL ? 'G' : bps >= 1000000UL ? 'M' : 'k',
-	              tasks_run_queue_cur, nb_tasks_cur, ti->idle_pct
+	              total_run_queues(), nb_tasks_cur, ti->idle_pct
 	              );
 
 	/* scope_txt = search query, appctx->ctx.stats.scope_len is always <= STAT_SCOPE_TXT_MAXLEN */
@@ -4366,7 +4366,7 @@
 	info[INF_MAX_ZLIB_MEM_USAGE]             = mkf_u32(FO_CONFIG|FN_LIMIT, global.maxzlibmem);
 #endif
 	info[INF_TASKS]                          = mkf_u32(0, nb_tasks_cur);
-	info[INF_RUN_QUEUE]                      = mkf_u32(0, tasks_run_queue_cur);
+	info[INF_RUN_QUEUE]                      = mkf_u32(0, total_run_queues());
 	info[INF_IDLE_PCT]                       = mkf_u32(FN_AVG, ti->idle_pct);
 	info[INF_NODE]                           = mkf_str(FO_CONFIG|FN_OUTPUT|FS_SERVICE, global.node);
 	if (global.desc)
diff --git a/src/task.c b/src/task.c
index 9c8312c..153f7d6 100644
--- a/src/task.c
+++ b/src/task.c
@@ -37,8 +37,6 @@
 
 unsigned int nb_tasks = 0;
 volatile unsigned long global_tasks_mask = 0; /* Mask of threads with tasks in the global runqueue */
-unsigned int tasks_run_queue = 0;
-unsigned int tasks_run_queue_cur = 0;    /* copy of the run queue size */
 unsigned int nb_tasks_cur = 0;     /* copy of the tasks count */
 unsigned int niced_tasks = 0;      /* number of niced tasks in the run queue */
 
@@ -50,6 +48,7 @@
 #ifdef USE_THREAD
 struct eb_root timers;      /* sorted timers tree, global, accessed under wq_lock */
 struct eb_root rqueue;      /* tree constituting the global run queue, accessed under rq_lock */
+unsigned int grq_total;     /* total number of entries in the global run queue, use rq_lock */
 static unsigned int global_rqueue_ticks;  /* insertion count in the grq, use rq_lock */
 #endif
 
@@ -97,7 +96,7 @@
 			/* Beware: tasks that have never run don't have their ->list empty yet! */
 			MT_LIST_ADDQ(&task_per_thread[thr].shared_tasklet_list,
 			             (struct mt_list *)&((struct tasklet *)t)->list);
-			_HA_ATOMIC_ADD(&tasks_run_queue, 1);
+			_HA_ATOMIC_ADD(&task_per_thread[thr].rq_total, 1);
 			_HA_ATOMIC_ADD(&task_per_thread[thr].task_list_size, 1);
 			if (sleeping_thread_mask & (1UL << thr)) {
 				_HA_ATOMIC_AND(&sleeping_thread_mask, ~(1UL << thr));
@@ -122,19 +121,18 @@
 	if (root == &rqueue) {
 		HA_SPIN_LOCK(TASK_RQ_LOCK, &rq_lock);
 	}
-#endif
-	/* Make sure if the task isn't in the runqueue, nobody inserts it
-	 * in the meanwhile.
-	 */
-	_HA_ATOMIC_ADD(&tasks_run_queue, 1);
-#ifdef USE_THREAD
+
 	if (root == &rqueue) {
 		global_tasks_mask |= t->thread_mask;
+		grq_total++;
 		t->rq.key = ++global_rqueue_ticks;
 		__ha_barrier_store();
 	} else
 #endif
+	{
+		_HA_ATOMIC_ADD(&sched->rq_total, 1);
 		t->rq.key = ++sched->rqueue_ticks;
+	}
 
 	if (likely(t->nice)) {
 		int offset;
@@ -460,7 +458,7 @@
 		t->calls++;
 		sched->current = t;
 
-		_HA_ATOMIC_SUB(&tasks_run_queue, 1);
+		_HA_ATOMIC_SUB(&sched->rq_total, 1);
 
 		if (TASK_IS_TASKLET(t)) {
 			LIST_DEL_INIT(&((struct tasklet *)t)->list);
@@ -595,7 +593,6 @@
 		return;
 	}
 
-	tasks_run_queue_cur = tasks_run_queue; /* keep a copy for reporting */
 	nb_tasks_cur = nb_tasks;
 	max_processed = global.tune.runqueue_depth;
 
@@ -702,7 +699,7 @@
 	if (picked) {
 		tt->tl_class_mask |= 1 << TL_NORMAL;
 		_HA_ATOMIC_ADD(&tt->task_list_size, picked);
-		_HA_ATOMIC_ADD(&tasks_run_queue, picked);
+		_HA_ATOMIC_ADD(&tt->rq_total, picked);
 		activity[tid].tasksw += picked;
 	}