[OPTIM] task: reduce the number of calls to task_queue()

Most of the time, task_queue() will immediately return. By extracting
the preliminary checks and putting them in an inline function, we can
significantly reduce the number of calls to the function itself, and
most of the tests can be optimized away due to the caller's context.

Another minor improvement in process_runnable_tasks() consists in
taking advantage of the processor's branch prediction unit by making
a special case of the process_session() callback, which is by far the
most common one.

All this improved performance by about 1%, mainly during the call
from process_runnable_tasks().
diff --git a/include/common/ticks.h b/include/common/ticks.h
index f3c1a7d..4587d56 100644
--- a/include/common/ticks.h
+++ b/include/common/ticks.h
@@ -113,6 +113,17 @@
 		return t2;
 }
 
+/* return the earliest of two ticks; only <t1> may be infinite, <t2> is never checked */
+static inline int tick_first_2nz(int t1, int t2)
+{
+	if (!tick_isset(t1))	/* <t1> is not set: <t2> is the only candidate */
+		return t2;
+	if ((t1 - t2) <= 0)	/* signed difference: <t1> is not after <t2> */
+		return t1;
+	else
+		return t2;
+}
+
 /* return the number of ticks remaining from <now> to <exp>, or zero if expired */
 static inline int tick_remain(int now, int exp)
 {
diff --git a/include/proto/task.h b/include/proto/task.h
index 67eb924..b5f2280 100644
--- a/include/proto/task.h
+++ b/include/proto/task.h
@@ -246,9 +246,27 @@
 }
 
 /* Place <task> into the wait queue, where it may already be. If the expiration
- * timer is infinite, the task is dequeued.
+ * timer is infinite, do nothing and rely on wake_expired_tasks() to clean up.
  */
-void task_queue(struct task *task);
+void __task_queue(struct task *task);
+static inline void task_queue(struct task *task)
+{
+	/* Cheap inline pre-checks, so that __task_queue() is only called when
+	 * the task really has to be (re)inserted. A task whose timeout has
+	 * been disabled is left where it is: if it is still in the wait
+	 * queue, we rely on wake_expired_tasks() to catch the node and move
+	 * it to the proper place should it ever happen. Likewise, a task
+	 * already queued no later than the requested timeout stays there,
+	 * because it is very unlikely that we will reach the timeout anyway.
+	 * The task is only queued if it was not there or was further away.
+	 */
+	if (!tick_isset(task->expire))
+		return;
+
+	if (((tick_to_timer(task->expire) - task->wq.key) & TIMER_SIGN_BIT)
+		|| !task_in_wq(task))
+		__task_queue(task);
+}
 
 /*
  * This does 4 things :
diff --git a/src/task.c b/src/task.c
index f054e31..e91a26b 100644
--- a/src/task.c
+++ b/src/task.c
@@ -18,6 +18,7 @@
 #include <common/time.h>
 
 #include <proto/proxy.h>
+#include <proto/session.h>
 #include <proto/task.h>
 
 struct pool_head *pool2_task;
@@ -62,34 +63,23 @@
 }
 
 /*
- * task_queue()
+ * __task_queue()
  *
  * Inserts a task into the wait queue at the position given by its expiration
  * date. It does not matter if the task was already in the wait queue or not,
- * and it may even help if its position has not changed because we'll be able
- * to return without doing anything. Tasks queued with an eternity expiration
- * are just unlinked from the WQ. Last, tasks must not be queued further than
- * the end of the next tree, which is between <now_ms> and <now_ms> +
- * TIMER_SIGN_BIT ms (now+12days..24days in 32bit).
+ * as it will be unlinked. The task must not have an infinite expiration timer.
+ * Last, tasks must not be queued further than the end of the next tree, which
+ * is between <now_ms> and <now_ms> + TIMER_SIGN_BIT ms (now+12days..24days in
+ * 32bit).
+ *
+ * This function should not be used directly, it is meant to be called by the
+ * inline version of task_queue() which performs a few cheap preliminary tests
+ * before deciding to call __task_queue().
  */
-void task_queue(struct task *task)
+void __task_queue(struct task *task)
 {
-	/* if the task is already in the wait queue, we may reuse its position
-	 * or we will at least have to unlink it first.
-	 */
-	if (task_in_wq(task)) {
-		/* If we already have a place in the wait queue no later than the
-		 * timeout we're trying to set, we'll stay there, because it is very
-		 * unlikely that we will reach the timeout anyway. If the timeout
-		 * has been disabled, it's useless to leave the queue as well. We'll
-		 * rely on wake_expired_tasks() to catch the node and move it to the
-		 * proper place should it ever happen.
-		 */
-		if (!tick_isset(task->expire) ||
-		    ((task->wq.key - tick_to_timer(task->expire)) & TIMER_SIGN_BIT))
-			return;
+	if (likely(task_in_wq(task)))
 		__task_unlink_wq(task);
-	}
 
 	/* the task is not in the queue now */
 	if (unlikely(!tick_isset(task->expire)))
@@ -104,8 +94,8 @@
 
 	if (likely(last_timer &&
 		   last_timer->wq.key == task->wq.key &&
-		   last_timer->wq.node.node_p &&
-		   last_timer->wq.node.bit == -1)) {
+		   last_timer->wq.node.bit == -1 &&
+		   last_timer->wq.node.node_p)) {
 		/* Most often, last queued timer has the same expiration date, so
 		 * if it's not queued at the root, let's queue a dup directly there.
 		 * Note that we can only use dups at the dup tree's root (bit==-1).
@@ -145,7 +135,7 @@
 		eb = eb32_first(&timers[tree]);
 		while (eb) {
 			task = eb32_entry(eb, struct task, wq);
-			if ((tick_to_timer(now_ms) - eb->key) & TIMER_SIGN_BIT) {
+			if (likely((tick_to_timer(now_ms) - eb->key) & TIMER_SIGN_BIT)) {
 				/* note that we don't need this check for the <previous>
 				 * tree, but it's cheaper than duplicating the code.
 				 */
@@ -221,6 +211,10 @@
 	do {
 		eb = eb32_first(&rqueue[tree]);
 		while (eb) {
+			/* Note: this loop is one of the fastest code paths in
+			 * the whole program. It should not be re-arranged
+			 * without a good reason.
+			 */
 			t = eb32_entry(eb, struct task, rq);
 
 			/* detach the task from the queue and add the task to the run queue */
@@ -228,10 +222,20 @@
 			__task_unlink_rq(t);
 
 			t->state |= TASK_RUNNING;
-			if (likely(t->process(t) != NULL)) {
+			/* This is an optimisation to help the processor's branch
+			 * predictor take this most common call.
+			 */
+			if (likely(t->process == process_session))
+				t = process_session(t);
+			else
+				t = t->process(t);
+
+			if (likely(t != NULL)) {
 				t->state &= ~TASK_RUNNING;
-				expire = tick_first(expire, t->expire);
-				task_queue(t);
+				if (t->expire) {
+					task_queue(t);
+					expire = tick_first_2nz(expire, t->expire);
+				}
 			}
 
 			if (!--max_processed)