MEDIUM: tasks: apply a fair CPU distribution between tasklet classes

Until now, process_runnable_tasks() used to reserve a fixed portion of
max_processed for urgent tasks, then a portion of what remained for
normal tasks, then what remained for bulk tasks. This caused two issues:

  - the budget for processed tasks could be drained all at once by the
    higher-level classes, leaving the lower-level ones with too little
    for the next run. For example, if bulk tasklets caused task wakeups,
    the share required to run those woken tasks could be eaten by other
    bulk tasklets.

  - it forced the urgent tasklets to be run before scanning the tree so
    that we knew how many tasks to pick from the tree, and this isn't
    very cache-efficient (the old flow is sketched below).
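
That old flow boiled down to the following simplified sketch,
reconstructed from the hunks removed below; the tree scan between the
urgent and normal steps and the guards against a depleted budget are
omitted:

    /* old scheme: each step takes a fixed slice of whatever budget is
     * left, in strict order, so the earlier steps can starve the later
     * ones for the whole run
     */
    max_processed -= run_tasks_from_list(&tt->tasklets[TL_URGENT],
                                         (max_processed + 2) / 3);

    /* ... pick tasks from the tree, bounded by 3/4 of the remainder ... */

    max_processed -= run_tasks_from_list(&tt->tasklets[TL_NORMAL],
                                         (3 * max_processed + 3) / 4);

    max_processed -= run_tasks_from_list(&tt->tasklets[TL_BULK],
                                         max_processed);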

This patch changes this so that we compute upfront how max_processed
will be shared between the classes that need it. We can then decide in
advance to pick a certain number of tasks from the tree, then execute
all tasklets in turn. When reaching the end, if there's still some
budget left, we can go back and do the same thing again, improving the
chances of picking up new work before the global budget is depleted.
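
Schematically, the new flow is close to the condensed sketch below,
based on the hunks added further down. The tree scan, the locking and
the shared tasklet list handling are left out; class_has_work() is only
a stand-in for the actual per-class emptiness checks, default_weights[]
is the array declared in the patch, and the exact loop-back condition
is an assumption:

    unsigned int max[TL_CLASSES];  // per-class budget
    unsigned int max_total, queue;

 not_done_yet:
    /* only the classes that currently have work get their default weight */
    for (queue = 0; queue < TL_CLASSES; queue++)
        max[queue] = class_has_work(tt, queue) ? default_weights[queue] : 0;

    /* turn the weights into per-class budgets; rounding up may make the
     * sum slightly exceed max_processed, which is harmless
     */
    max_total = max[TL_URGENT] + max[TL_NORMAL] + max[TL_BULK];
    for (queue = 0; queue < TL_CLASSES; queue++)
        max[queue] = ((unsigned)max_processed * max[queue] + max_total - 1) / max_total;

    /* pick up to max[TL_NORMAL] tasks from the run queue tree (omitted),
     * then run each tasklet class in turn within its own budget
     */
    for (queue = 0; queue < TL_CLASSES; queue++)
        max_processed -= run_tasks_from_list(&tt->tasklets[queue], max[queue]);

    /* if some budget remains and work is still pending, start over */
    if (max_processed > 0 && thread_has_tasks())
        goto not_done_yet;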

The default weights have been set to 50% for urgent tasklets, 37% for
normal ones and 13% for bulk ones. In practice there are not that many
urgent tasklets, but when they show up they are cheap and must be
processed in batches as large as possible. Every time there is nothing
to pick from the urgent queue, its unused share is redistributed
between normal and bulk, which still leaves the bulk tasklets quite
some CPU to run on.
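
As an illustration of the weight arithmetic, taking max_processed = 200
(an arbitrary value for this example) with all three classes active:

    max_total = 64 + 48 + 16 = 128
    max[TL_URGENT] = ceil(200 * 64 / 128) = 100   (~50%)
    max[TL_NORMAL] = ceil(200 * 48 / 128) =  75   (~37%)
    max[TL_BULK]   = ceil(200 * 16 / 128) =  25   (~13%)

and when the urgent queue is empty, its weight simply drops out of the
total, so the normal and bulk classes absorb its share:

    max_total = 48 + 16 = 64
    max[TL_NORMAL] = ceil(200 * 48 / 64) = 150    (75%)
    max[TL_BULK]   = ceil(200 * 16 / 64) =  50    (25%)
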
diff --git a/src/task.c b/src/task.c
index 1f7fd53..ed0416c 100644
--- a/src/task.c
+++ b/src/task.c
@@ -422,14 +422,27 @@
 void process_runnable_tasks()
 {
 	struct task_per_thread * const tt = sched;
-	struct eb32sc_node *lrq = NULL; // next local run queue entry
-	struct eb32sc_node *grq = NULL; // next global run queue entry
+	struct eb32sc_node *lrq; // next local run queue entry
+	struct eb32sc_node *grq; // next global run queue entry
 	struct task *t;
-	int max_processed, done;
+	const unsigned int default_weights[TL_CLASSES] = {
+		[TL_URGENT] = 64, // ~50% of CPU bandwidth for I/O
+		[TL_NORMAL] = 48, // ~37% of CPU bandwidth for tasks
+		[TL_BULK]   = 16, // ~13% of CPU bandwidth for self-wakers
+	};
+	unsigned int max[TL_CLASSES]; // max to be run per class
+	unsigned int max_total;       // sum of max above
 	struct mt_list *tmp_list;
+	unsigned int queue;
+	int max_processed;
 
 	ti->flags &= ~TI_FL_STUCK; // this thread is still running
 
+	if (!thread_has_tasks()) {
+		activity[tid].empty_rq++;
+		return;
+	}
+
 	tasks_run_queue_cur = tasks_run_queue; /* keep a copy for reporting */
 	nb_tasks_cur = nb_tasks;
 	max_processed = global.tune.runqueue_depth;
@@ -437,30 +450,37 @@
 	if (likely(niced_tasks))
 		max_processed = (max_processed + 3) / 4;
 
-	if (!thread_has_tasks()) {
-		activity[tid].empty_rq++;
-		return;
-	}
-
  not_done_yet:
-	/* Merge the list of tasklets waken up by other threads to the
-	 * main list.
-	 */
-	tmp_list = MT_LIST_BEHEAD(&sched->shared_tasklet_list);
-	if (tmp_list)
-		LIST_SPLICE_END_DETACHED(&sched->tasklets[TL_URGENT], (struct list *)tmp_list);
+	max[TL_URGENT] = max[TL_NORMAL] = max[TL_BULK] = 0;
 
-	/* run up to max_processed/3 urgent tasklets */
-	if (max_processed > 0) {
-		done = run_tasks_from_list(&tt->tasklets[TL_URGENT], (max_processed + 2) / 3);
-		max_processed -= done;
-	}
+	/* urgent tasklets list gets a default weight of ~50% */
+	if (!LIST_ISEMPTY(&tt->tasklets[TL_URGENT]) ||
+	    !MT_LIST_ISEMPTY(&tt->shared_tasklet_list))
+		max[TL_URGENT] = default_weights[TL_URGENT];
 
-	/* pick up to max_processed/2 (~=3/4*(max_processed-done)) regular tasks from prio-ordered run queues */
+	/* normal tasklets list gets a default weight of ~37% */
+	if (!LIST_ISEMPTY(&tt->tasklets[TL_NORMAL]) ||
+	    (sched->rqueue_size > 0) || (global_tasks_mask & tid_bit))
+		max[TL_NORMAL] = default_weights[TL_NORMAL];
 
-	/* Note: the grq lock is always held when grq is not null */
+	/* bulk tasklets list gets a default weight of ~13% */
+	if (!LIST_ISEMPTY(&tt->tasklets[TL_BULK]))
+		max[TL_BULK] = default_weights[TL_BULK];
+
+	/* Now compute a fair share of the weights. Total may slightly exceed
+	 * 100% due to rounding, this is not a problem. Note that by design
+	 * the sum cannot be NULL as we cannot get there without tasklets to
+	 * process.
+	 */
+	max_total = max[TL_URGENT] + max[TL_NORMAL] + max[TL_BULK];
+	for (queue = 0; queue < TL_CLASSES; queue++)
+		max[queue]  = ((unsigned)max_processed * max[queue] + max_total - 1) / max_total;
+
+	lrq = grq = NULL;
 
-	while (tt->task_list_size < (3 * max_processed + 3) / 4) {
+	/* pick up to max[TL_NORMAL] regular tasks from prio-ordered run queues */
+	/* Note: the grq lock is always held when grq is not null */
+	while (tt->task_list_size < max[TL_NORMAL]) {
 		if ((global_tasks_mask & tid_bit) && !grq) {
 #ifdef USE_THREAD
 			HA_SPIN_LOCK(TASK_RQ_LOCK, &rq_lock);
@@ -522,16 +542,17 @@
 		grq = NULL;
 	}
 
-	/* run between 0.4*max_processed and max_processed/2 regular tasks */
-	if (max_processed > 0) {
-		done = run_tasks_from_list(&tt->tasklets[TL_NORMAL], (3 * max_processed + 3) / 4);
-		max_processed -= done;
-	}
+	/* Merge the list of tasklets waken up by other threads to the
+	 * main list.
+	 */
+	tmp_list = MT_LIST_BEHEAD(&tt->shared_tasklet_list);
+	if (tmp_list)
+		LIST_SPLICE_END_DETACHED(&tt->tasklets[TL_URGENT], (struct list *)tmp_list);
 
-	/* run between max_processed/4 and max_processed bulk tasklets */
-	if (max_processed > 0) {
-		done = run_tasks_from_list(&tt->tasklets[TL_BULK], max_processed);
-		max_processed -= done;
+	/* execute tasklets in each queue */
+	for (queue = 0; queue < TL_CLASSES; queue++) {
+		if (max[queue] > 0)
+			max_processed -= run_tasks_from_list(&tt->tasklets[queue], max[queue]);
 	}
 
 	/* some tasks may have woken other ones up */