MEDIUM: tasks: apply a fair CPU distribution between tasklet classes

Until now, process_runnable_tasks() used to reserve a fixed portion of
max_processed for urgent tasks, then a portion of what remained for
normal tasks, then what remained for bulk tasks. This caused two issues:

  - the budget for processed tasks could be drained all at once by the
    higher-level classes, leaving the lower-level ones with too little
    for the next run. For example, if bulk tasklets caused task wakeups,
    the share required to run those woken tasks could be eaten by other
    bulk tasklets.

  - it forced the urgent tasklets to be run before scanning the tree so
    that we knew how many tasks to pick from the tree, and this isn't
    very cache-efficient (the old flow is sketched below).
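
That old flow boiled down to the following simplified sketch,
reconstructed from the hunks removed below; the tree scan between the
urgent and normal steps and the guards against a depleted budget are
omitted:

    /* old scheme: each step takes a fixed slice of whatever budget is
     * left, in strict order, so the earlier steps can starve the later
     * ones for the whole run
     */
    max_processed -= run_tasks_from_list(&tt->tasklets[TL_URGENT],
                                         (max_processed + 2) / 3);

    /* ... pick tasks from the tree, bounded by 3/4 of the remainder ... */

    max_processed -= run_tasks_from_list(&tt->tasklets[TL_NORMAL],
                                         (3 * max_processed + 3) / 4);

    max_processed -= run_tasks_from_list(&tt->tasklets[TL_BULK],
                                         max_processed);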

This patch changes this so that we compute upfront how max_processed
will be shared between the classes that need it. We can then decide in
advance to pick a certain number of tasks from the tree, then execute
all tasklets in turn. When reaching the end, if there's still some
budget left, we can go back and do the same thing again, improving the
chances of picking up new work before the global budget is depleted.
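
Schematically, the new flow is close to the condensed sketch below,
based on the hunks added further down. The tree scan, the locking and
the shared tasklet list handling are left out; class_has_work() is only
a stand-in for the actual per-class emptiness checks, default_weights[]
is the array declared in the patch, and the exact loop-back condition
is an assumption:

    unsigned int max[TL_CLASSES];  // per-class budget
    unsigned int max_total, queue;

 not_done_yet:
    /* only the classes that currently have work get their default weight */
    for (queue = 0; queue < TL_CLASSES; queue++)
        max[queue] = class_has_work(tt, queue) ? default_weights[queue] : 0;

    /* turn the weights into per-class budgets; rounding up may make the
     * sum slightly exceed max_processed, which is harmless
     */
    max_total = max[TL_URGENT] + max[TL_NORMAL] + max[TL_BULK];
    for (queue = 0; queue < TL_CLASSES; queue++)
        max[queue] = ((unsigned)max_processed * max[queue] + max_total - 1) / max_total;

    /* pick up to max[TL_NORMAL] tasks from the run queue tree (omitted),
     * then run each tasklet class in turn within its own budget
     */
    for (queue = 0; queue < TL_CLASSES; queue++)
        max_processed -= run_tasks_from_list(&tt->tasklets[queue], max[queue]);

    /* if some budget remains and work is still pending, start over */
    if (max_processed > 0 && thread_has_tasks())
        goto not_done_yet;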

The default weights have been set to 50% for urgent tasklets, 37% for
normal ones and 13% for bulk ones. In practice there are not that many
urgent tasklets, but when they show up they are cheap and must be
processed in batches as large as possible. Every time there is nothing
to pick from the urgent queue, its unused share is redistributed
between normal and bulk, which still leaves the bulk tasklets quite
some CPU to run on.
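
As an illustration of the weight arithmetic, taking max_processed = 200
(an arbitrary value for this example) with all three classes active:

    max_total = 64 + 48 + 16 = 128
    max[TL_URGENT] = ceil(200 * 64 / 128) = 100   (~50%)
    max[TL_NORMAL] = ceil(200 * 48 / 128) =  75   (~37%)
    max[TL_BULK]   = ceil(200 * 16 / 128) =  25   (~13%)

and when the urgent queue is empty, its weight simply drops out of the
total, so the normal and bulk classes absorb its share:

    max_total = 48 + 16 = 64
    max[TL_NORMAL] = ceil(200 * 48 / 64) = 150    (75%)
    max[TL_BULK]   = ceil(200 * 16 / 64) =  50    (25%)
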
diff --git a/src/task.c b/src/task.c
index 1f7fd53..ed0416c 100644
--- a/src/task.c
+++ b/src/task.c
@@ -422,14 +422,27 @@
 void process_runnable_tasks()
 {
 	struct task_per_thread * const tt = sched;
-	struct eb32sc_node *lrq = NULL; // next local run queue entry
-	struct eb32sc_node *grq = NULL; // next global run queue entry
+	struct eb32sc_node *lrq; // next local run queue entry
+	struct eb32sc_node *grq; // next global run queue entry
 	struct task *t;
-	int max_processed, done;
+	const unsigned int default_weights[TL_CLASSES] = {
+		[TL_URGENT] = 64, // ~50% of CPU bandwidth for I/O
+		[TL_NORMAL] = 48, // ~37% of CPU bandwidth for tasks
+		[TL_BULK]   = 16, // ~13% of CPU bandwidth for self-wakers
+	};
+	unsigned int max[TL_CLASSES]; // max to be run per class
+	unsigned int max_total;       // sum of max above
 	struct mt_list *tmp_list;
+	unsigned int queue;
+	int max_processed;
 
 	ti->flags &= ~TI_FL_STUCK; // this thread is still running
 
+	if (!thread_has_tasks()) {
+		activity[tid].empty_rq++;
+		return;
+	}
+
 	tasks_run_queue_cur = tasks_run_queue; /* keep a copy for reporting */
 	nb_tasks_cur = nb_tasks;
 	max_processed = global.tune.runqueue_depth;
@@ -437,30 +450,37 @@
 	if (likely(niced_tasks))
 		max_processed = (max_processed + 3) / 4;
 
-	if (!thread_has_tasks()) {
-		activity[tid].empty_rq++;
-		return;
-	}
-
  not_done_yet:
-	/* Merge the list of tasklets waken up by other threads to the
-	 * main list.
-	 */
-	tmp_list = MT_LIST_BEHEAD(&sched->shared_tasklet_list);
-	if (tmp_list)
-		LIST_SPLICE_END_DETACHED(&sched->tasklets[TL_URGENT], (struct list *)tmp_list);
+	max[TL_URGENT] = max[TL_NORMAL] = max[TL_BULK] = 0;
 
-	/* run up to max_processed/3 urgent tasklets */
-	if (max_processed > 0) {
-		done = run_tasks_from_list(&tt->tasklets[TL_URGENT], (max_processed + 2) / 3);
-		max_processed -= done;
-	}
+	/* urgent tasklets list gets a default weight of ~50% */
+	if (!LIST_ISEMPTY(&tt->tasklets[TL_URGENT]) ||
+	    !MT_LIST_ISEMPTY(&tt->shared_tasklet_list))
+		max[TL_URGENT] = default_weights[TL_URGENT];
 
-	/* pick up to max_processed/2 (~=3/4*(max_processed-done)) regular tasks from prio-ordered run queues */
+	/* normal tasklets list gets a default weight of ~37% */
+	if (!LIST_ISEMPTY(&tt->tasklets[TL_NORMAL]) ||
+	    (sched->rqueue_size > 0) || (global_tasks_mask & tid_bit))
+		max[TL_NORMAL] = default_weights[TL_NORMAL];
 
-	/* Note: the grq lock is always held when grq is not null */
+	/* bulk tasklets list gets a default weight of ~13% */
+	if (!LIST_ISEMPTY(&tt->tasklets[TL_BULK]))
+		max[TL_BULK] = default_weights[TL_BULK];
+
+	/* Now compute a fair share of the weights. Total may slightly exceed
+	 * 100% due to rounding, this is not a problem. Note that by design
+	 * the sum cannot be NULL as we cannot get there without tasklets to
+	 * process.
+	 */
+	max_total = max[TL_URGENT] + max[TL_NORMAL] + max[TL_BULK];
+	for (queue = 0; queue < TL_CLASSES; queue++)
+		max[queue]  = ((unsigned)max_processed * max[queue] + max_total - 1) / max_total;
+
+	lrq = grq = NULL;
 
-	while (tt->task_list_size < (3 * max_processed + 3) / 4) {
+	/* pick up to max[TL_NORMAL] regular tasks from prio-ordered run queues */
+	/* Note: the grq lock is always held when grq is not null */
+	while (tt->task_list_size < max[TL_NORMAL]) {
 		if ((global_tasks_mask & tid_bit) && !grq) {
 #ifdef USE_THREAD
 			HA_SPIN_LOCK(TASK_RQ_LOCK, &rq_lock);
@@ -522,16 +542,17 @@
 		grq = NULL;
 	}
 
-	/* run between 0.4*max_processed and max_processed/2 regular tasks */
-	if (max_processed > 0) {
-		done = run_tasks_from_list(&tt->tasklets[TL_NORMAL], (3 * max_processed + 3) / 4);
-		max_processed -= done;
-	}
+	/* Merge the list of tasklets waken up by other threads to the
+	 * main list.
+	 */
+	tmp_list = MT_LIST_BEHEAD(&tt->shared_tasklet_list);
+	if (tmp_list)
+		LIST_SPLICE_END_DETACHED(&tt->tasklets[TL_URGENT], (struct list *)tmp_list);
 
-	/* run between max_processed/4 and max_processed bulk tasklets */
-	if (max_processed > 0) {
-		done = run_tasks_from_list(&tt->tasklets[TL_BULK], max_processed);
-		max_processed -= done;
+	/* execute tasklets in each queue */
+	for (queue = 0; queue < TL_CLASSES; queue++) {
+		if (max[queue] > 0)
+			max_processed -= run_tasks_from_list(&tt->tasklets[queue], max[queue]);
 	}
 
 	/* some tasks may have woken other ones up */