MEDIUM: tasks: collect per-task CPU time and latency

Right now we measure for each task the cumulated time spent waiting for
the CPU and using it. The timestamp uses a 64-bit integer to report a
nanosecond-level date. This is only enabled when "profiling.tasks" is
enabled, and consumes less than 1% extra CPU on x86_64 when enabled.
The cumulated processing time and wait time are reported in "show sess".

The task's counters are also reset when an HTTP transaction is reset
since the HTTP part pretends to restart on a fresh new stream. This
will make sure we always report correct numbers for each request in
the logs.
diff --git a/src/proto_http.c b/src/proto_http.c
index f7222cd..019556c 100644
--- a/src/proto_http.c
+++ b/src/proto_http.c
@@ -3809,6 +3809,12 @@
 	stream_stop_content_counters(s);
 	stream_update_time_stats(s);
 
+	/* reset the profiling counter */
+	s->task->calls     = 0;
+	s->task->cpu_time  = 0;
+	s->task->lat_time  = 0;
+	s->task->call_date = (profiling & HA_PROF_TASKS) ? now_mono_time() : 0;
+
 	s->logs.accept_date = date; /* user-visible date for logging */
 	s->logs.tv_accept = now;  /* corrected date for internal use */
 	s->logs.t_handshake = 0; /* There are no handshake in keep alive connection. */
diff --git a/src/stream.c b/src/stream.c
index 3c84cbe..96e7f88 100644
--- a/src/stream.c
+++ b/src/stream.c
@@ -2984,14 +2984,15 @@
 		}
 		else if ((tmpctx = objt_appctx(strm->si[0].end)) != NULL) {
 			chunk_appendf(&trash,
-			              "  app0=%p st0=%d st1=%d st2=%d applet=%s tmask=0x%lx nice=%d calls=%u\n",
+			              "  app0=%p st0=%d st1=%d st2=%d applet=%s tmask=0x%lx nice=%d calls=%u cpu=%llu lat=%llu\n",
 				      tmpctx,
 				      tmpctx->st0,
 				      tmpctx->st1,
 				      tmpctx->st2,
 			              tmpctx->applet->name,
 			              tmpctx->thread_mask,
-			              tmpctx->t->nice, tmpctx->t->calls);
+			              tmpctx->t->nice, tmpctx->t->calls,
+			              (unsigned long long)tmpctx->t->cpu_time, (unsigned long long)tmpctx->t->lat_time);
 		}
 
 		if ((cs = objt_cs(strm->si[1].end)) != NULL) {
@@ -3018,14 +3019,15 @@
 		}
 		else if ((tmpctx = objt_appctx(strm->si[1].end)) != NULL) {
 			chunk_appendf(&trash,
-			              "  app1=%p st0=%d st1=%d st2=%d applet=%s tmask=0x%lx, nice=%d, calls=%u\n",
+			              "  app1=%p st0=%d st1=%d st2=%d applet=%s tmask=0x%lx, nice=%d, calls=%u, cpu=%llu, lat=%llu\n",
 				      tmpctx,
 				      tmpctx->st0,
 				      tmpctx->st1,
 				      tmpctx->st2,
 			              tmpctx->applet->name,
 			              tmpctx->thread_mask,
-			              tmpctx->t->nice, tmpctx->t->calls);
+			              tmpctx->t->nice, tmpctx->t->calls,
+			              (unsigned long long)tmpctx->t->cpu_time, (unsigned long long)tmpctx->t->lat_time);
 		}
 
 		chunk_appendf(&trash,
@@ -3225,10 +3227,11 @@
 			}
 
 			chunk_appendf(&trash,
-				     " ts=%02x age=%s calls=%d",
+				     " ts=%02x age=%s calls=%d cpu=%llu lat=%llu",
 				     curr_strm->task->state,
 				     human_time(now.tv_sec - curr_strm->logs.tv_accept.tv_sec, 1),
-				     curr_strm->task->calls);
+			             curr_strm->task->calls,
+			             (unsigned long long)curr_strm->task->cpu_time, (unsigned long long)curr_strm->task->lat_time);
 
 			chunk_appendf(&trash,
 				     " rq[f=%06xh,i=%u,an=%02xh,rx=%s",
diff --git a/src/task.c b/src/task.c
index 779dbf5..5c5e104 100644
--- a/src/task.c
+++ b/src/task.c
@@ -140,6 +140,9 @@
 		t->rq.key += offset;
 	}
 
+	if (profiling & HA_PROF_TASKS)
+		t->call_date = now_mono_time();
+
 	eb32sc_insert(root, &t->rq, t->thread_mask);
 #ifdef USE_THREAD
 	if (root == &rqueue) {
@@ -416,6 +419,14 @@
 		ctx = t->context;
 		process = t->process;
 		t->calls++;
+
+		if (unlikely(!TASK_IS_TASKLET(t) && t->call_date)) {
+			uint64_t now_ns = now_mono_time();
+
+			t->lat_time += now_ns - t->call_date;
+			t->call_date = now_ns;
+		}
+
 		curr_task = (struct task *)t;
 		if (likely(process == process_stream))
 			t = process_stream(t, ctx, state);
@@ -432,6 +443,11 @@
 		 * immediately, else we defer it into wait queue
 		 */
 		if (t != NULL) {
+			if (unlikely(!TASK_IS_TASKLET(t) && t->call_date)) {
+				t->cpu_time += now_mono_time() - t->call_date;
+				t->call_date = 0;
+			}
+
 			state = HA_ATOMIC_AND(&t->state, ~TASK_RUNNING);
 			if (state)
 #ifdef USE_THREAD