CLEANUP: task: split the large tasklet_wakeup_on() function in two

This function has become large with the multi-queue scheduler. We need
to keep the fast path and the debugging parts inlined, but the rest now
moves to task.c, just as was done for task_wakeup(). This has reduced
the code size by 6kB thanks to less inlining of large parts that are
always context-dependent, and as a side effect, has increased the
overall performance by 1%.
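
For illustration, a minimal sketch of the inline side of the split (a
hypothetical simplification, not the exact code kept in task.h; it assumes,
per the comment on __tasklet_wakeup_on() in the hunk below, that the inline
part sets TASK_IN_LIST and returns early when the tasklet is already queued,
and it elides the DEBUG_TASK bookkeeping visible in the task.h hunk):

    /* simplified sketch only, not the real inline wrapper */
    static inline void tasklet_wakeup_on(struct tasklet *tl, int thr)
    {
            unsigned int state = tl->state;

            do {
                    /* fast path: already queued, nothing to do */
                    if (state & TASK_IN_LIST)
                            return;
            } while (!_HA_ATOMIC_CAS(&tl->state, &state, state | TASK_IN_LIST));

            /* DEBUG_TASK caller file/line bookkeeping stays inline here */

            /* slow path, now out of line in task.c: queue class selection
             * and the actual queueing
             */
            __tasklet_wakeup_on(tl, thr);
    }

This way each caller only carries the cheap checks, while the branchy
class-selection code below is emitted once in task.c.
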
diff --git a/include/haproxy/task.h b/include/haproxy/task.h
index 018d97e..f1440d0 100644
--- a/include/haproxy/task.h
+++ b/include/haproxy/task.h
@@ -105,6 +105,7 @@
 __decl_thread(extern HA_SPINLOCK_T rq_lock);  /* spin lock related to run queue */
 __decl_thread(extern HA_RWLOCK_T wq_lock);    /* RW lock related to the wait queue */
 
+void __tasklet_wakeup_on(struct tasklet *tl, int thr);
 void task_kill(struct task *t);
 void __task_wakeup(struct task *t);
 void __task_queue(struct task *task, struct eb_root *wq);
@@ -375,36 +376,7 @@
 	tl->debug.caller_file[tl->debug.caller_idx] = file;
 	tl->debug.caller_line[tl->debug.caller_idx] = line;
 #endif
-
-	if (likely(thr < 0)) {
-		/* this tasklet runs on the caller thread */
-		if (tl->state & TASK_SELF_WAKING) {
-			LIST_ADDQ(&sched->tasklets[TL_BULK], &tl->list);
-			sched->tl_class_mask |= 1 << TL_BULK;
-		}
-		else if ((struct task *)tl == sched->current) {
-			_HA_ATOMIC_OR(&tl->state, TASK_SELF_WAKING);
-			LIST_ADDQ(&sched->tasklets[TL_BULK], &tl->list);
-			sched->tl_class_mask |= 1 << TL_BULK;
-		}
-		else if (sched->current_queue < 0) {
-			LIST_ADDQ(&sched->tasklets[TL_URGENT], &tl->list);
-			sched->tl_class_mask |= 1 << TL_URGENT;
-		}
-		else {
-			LIST_ADDQ(&sched->tasklets[sched->current_queue], &tl->list);
-			sched->tl_class_mask |= 1 << sched->current_queue;
-		}
-		_HA_ATOMIC_ADD(&sched->rq_total, 1);
-	} else {
-		/* this tasklet runs on a specific thread. */
-		MT_LIST_ADDQ(&task_per_thread[thr].shared_tasklet_list, (struct mt_list *)&tl->list);
-		_HA_ATOMIC_ADD(&task_per_thread[thr].rq_total, 1);
-		if (sleeping_thread_mask & (1UL << thr)) {
-			_HA_ATOMIC_AND(&sleeping_thread_mask, ~(1UL << thr));
-			wake_thread(thr);
-		}
-	}
+	__tasklet_wakeup_on(tl, thr);
 }
 
 /* schedules tasklet <tl> to run onto the thread designated by tl->tid, which
diff --git a/src/task.c b/src/task.c
index f91cd70..dbc6701 100644
--- a/src/task.c
+++ b/src/task.c
@@ -105,6 +105,44 @@
 	}
 }
 
+/* Do not call this one directly, please use tasklet_wakeup_on() instead, as
+ * this is only the slow path of tasklet_wakeup_on(), which performs some
+ * preliminary checks and sets TASK_IN_LIST before calling this one. A
+ * negative <thr> designates the current thread.
+ */
+void __tasklet_wakeup_on(struct tasklet *tl, int thr)
+{
+	if (likely(thr < 0)) {
+		/* this tasklet runs on the caller thread */
+		if (tl->state & TASK_SELF_WAKING) {
+			LIST_ADDQ(&sched->tasklets[TL_BULK], &tl->list);
+			sched->tl_class_mask |= 1 << TL_BULK;
+		}
+		else if ((struct task *)tl == sched->current) {
+			_HA_ATOMIC_OR(&tl->state, TASK_SELF_WAKING);
+			LIST_ADDQ(&sched->tasklets[TL_BULK], &tl->list);
+			sched->tl_class_mask |= 1 << TL_BULK;
+		}
+		else if (sched->current_queue < 0) {
+			LIST_ADDQ(&sched->tasklets[TL_URGENT], &tl->list);
+			sched->tl_class_mask |= 1 << TL_URGENT;
+		}
+		else {
+			LIST_ADDQ(&sched->tasklets[sched->current_queue], &tl->list);
+			sched->tl_class_mask |= 1 << sched->current_queue;
+		}
+		_HA_ATOMIC_ADD(&sched->rq_total, 1);
+	} else {
+		/* this tasklet runs on a specific thread. */
+		MT_LIST_ADDQ(&task_per_thread[thr].shared_tasklet_list, (struct mt_list *)&tl->list);
+		_HA_ATOMIC_ADD(&task_per_thread[thr].rq_total, 1);
+		if (sleeping_thread_mask & (1UL << thr)) {
+			_HA_ATOMIC_AND(&sleeping_thread_mask, ~(1UL << thr));
+			wake_thread(thr);
+		}
+	}
+}
+
 /* Puts the task <t> in run queue at a position depending on t->nice. <t> is
  * returned. The nice value assigns boosts in 32th of the run queue size. A
  * nice value of -1024 sets the task to -tasks_run_queue*32, while a nice value