MINOR: task: move the allocated tasks counter to the per-thread struct

The nb_tasks counter was still global: it was incremented and decremented
for each task_new()/task_free() call, and read in process_runnable_tasks().
But it's only used for stats reporting, so updating it this often is
pointless and expensive. Let's move it to the task_per_thread struct and
have the stats code sum the per-thread values when needed.
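
The pattern is: each thread updates its own counter with ordinary
non-atomic writes since it is the sole writer, and only the (rare) stats
path pays for cross-thread reads, via a relaxed atomic load such as the
one _HA_ATOMIC_LOAD provides. A minimal standalone sketch of this idea
(hypothetical sched_ctx/ctx_* names for illustration, not the real
HAProxy types or API):

    #define MAX_THREADS 64

    struct sched_ctx {
            unsigned int nb_tasks;  /* written only by the owning thread */
    } __attribute__((aligned(64))); /* one cache line per thread, no false sharing */

    static struct sched_ctx ctx[MAX_THREADS];
    static int nbthread = 1;                            /* set at startup */
    static __thread struct sched_ctx *my_ctx = &ctx[0]; /* set per thread */

    static inline void ctx_task_new(void)  { my_ctx->nb_tasks++; } /* no atomics */
    static inline void ctx_task_free(void) { my_ctx->nb_tasks--; }

    /* racy by design: a thread may update its counter while we sum,
     * which is acceptable for statistics reporting.
     */
    static inline unsigned int ctx_total_tasks(void)
    {
            unsigned int ret = 0;
            int thr;

            for (thr = 0; thr < nbthread; thr++)
                    ret += __atomic_load_n(&ctx[thr].nb_tasks, __ATOMIC_RELAXED);
            return ret;
    }

The aligned(64) keeps each thread's counter on its own cache line, so the
frequent owner-side increments never bounce a shared line between cores;
the cost of aggregation moves entirely to the stats consumer.
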
diff --git a/include/haproxy/task-t.h b/include/haproxy/task-t.h
index b2a69ee..3df4bf6 100644
--- a/include/haproxy/task-t.h
+++ b/include/haproxy/task-t.h
@@ -81,6 +81,7 @@
 	int current_queue;      /* points to current tasklet list being run, -1 if none */
 	unsigned int rq_total;  /* total size of the run queue, prio_tree + tasklets */
 	struct task *current;   /* current task (not tasklet) */
+	unsigned int nb_tasks;  /* number of tasks allocated on this thread */
 	uint8_t tl_class_mask;  /* bit mask of non-empty tasklets classes */
 	__attribute__((aligned(64))) char end[0];
 };
diff --git a/include/haproxy/task.h b/include/haproxy/task.h
index 99171e1..018d97e 100644
--- a/include/haproxy/task.h
+++ b/include/haproxy/task.h
@@ -87,10 +87,8 @@
 
 
 /* a few exported variables */
-extern unsigned int nb_tasks;     /* total number of tasks */
 extern volatile unsigned long global_tasks_mask; /* Mask of threads with tasks in the global runqueue */
 extern unsigned int grq_total;    /* total number of entries in the global run queue */
-extern unsigned int nb_tasks_cur;
 extern unsigned int niced_tasks;  /* number of niced tasks in the run queue */
 extern struct pool_head *pool_head_task;
 extern struct pool_head *pool_head_tasklet;
@@ -160,6 +158,19 @@
 	return ret;
 }
 
+/* returns the number of allocated tasks across all threads. Note that this
+ * *is* racy since some threads might be updating their counts while we're
+ * looking, but this is only for statistics reporting.
+ */
+static inline int total_allocated_tasks()
+{
+	int thr, ret;
+
+	for (thr = ret = 0; thr < global.nbthread; thr++)
+		ret += _HA_ATOMIC_LOAD(&task_per_thread[thr].nb_tasks);
+	return ret;
+}
+
 /* return 0 if task is in run queue, otherwise non-zero */
 static inline int task_in_rq(struct task *t)
 {
@@ -496,7 +507,7 @@
 {
 	struct task *t = pool_alloc(pool_head_task);
 	if (t) {
-		_HA_ATOMIC_ADD(&nb_tasks, 1);
+		sched->nb_tasks++;
 		task_init(t, thread_mask);
 	}
 	return t;
@@ -521,9 +532,9 @@
 #endif
 
 	pool_free(pool_head_task, t);
+	sched->nb_tasks--;
 	if (unlikely(stopping))
 		pool_flush(pool_head_task);
-	_HA_ATOMIC_SUB(&nb_tasks, 1);
 }
 
 /* Destroys a task : it's unlinked from the wait queues and is freed if it's
diff --git a/src/stats.c b/src/stats.c
index e124f28..53faee4 100644
--- a/src/stats.c
+++ b/src/stats.c
@@ -3339,7 +3339,7 @@
 	              actconn, pipes_used, pipes_used+pipes_free, read_freq_ctr(&global.conn_per_sec),
 		      bps >= 1000000000UL ? (bps / 1000000000.0) : bps >= 1000000UL ? (bps / 1000000.0) : (bps / 1000.0),
 		      bps >= 1000000000UL ? 'G' : bps >= 1000000UL ? 'M' : 'k',
-	              total_run_queues(), nb_tasks_cur, ti->idle_pct
+	              total_run_queues(), total_allocated_tasks(), ti->idle_pct
 	              );
 
 	/* scope_txt = search query, appctx->ctx.stats.scope_len is always <= STAT_SCOPE_TXT_MAXLEN */
@@ -4365,7 +4365,7 @@
 	info[INF_ZLIB_MEM_USAGE]                 = mkf_u32(0, zlib_used_memory);
 	info[INF_MAX_ZLIB_MEM_USAGE]             = mkf_u32(FO_CONFIG|FN_LIMIT, global.maxzlibmem);
 #endif
-	info[INF_TASKS]                          = mkf_u32(0, nb_tasks_cur);
+	info[INF_TASKS]                          = mkf_u32(0, total_allocated_tasks());
 	info[INF_RUN_QUEUE]                      = mkf_u32(0, total_run_queues());
 	info[INF_IDLE_PCT]                       = mkf_u32(FN_AVG, ti->idle_pct);
 	info[INF_NODE]                           = mkf_str(FO_CONFIG|FN_OUTPUT|FS_SERVICE, global.node);
diff --git a/src/task.c b/src/task.c
index 32c17b0..f91cd70 100644
--- a/src/task.c
+++ b/src/task.c
@@ -35,9 +35,7 @@
  */
 DECLARE_POOL(pool_head_notification, "notification", sizeof(struct notification));
 
-unsigned int nb_tasks = 0;
 volatile unsigned long global_tasks_mask = 0; /* Mask of threads with tasks in the global runqueue */
-unsigned int nb_tasks_cur = 0;     /* copy of the tasks count */
 unsigned int niced_tasks = 0;      /* number of niced tasks in the run queue */
 
 THREAD_LOCAL struct task_per_thread *sched = &task_per_thread[0]; /* scheduler context for the current thread */
@@ -591,7 +589,6 @@
 		return;
 	}
 
-	nb_tasks_cur = nb_tasks;
 	max_processed = global.tune.runqueue_depth;
 
 	if (likely(niced_tasks))